#############################################################

# Data prep ----

# Read the raw spreadsheet export; several numeric audio features arrive as
# text with a decimal comma and are normalised to numeric below.
library("rio")
x <- import("https://docs.google.com/spreadsheets/d/1h7XhLd2Byp4OcXSdtxHly9iS7RA673RJ/export?format=csv&gid=1083477596")

# Plain numeric coercions.
x$commentCount <- as.integer(x$commentCount)
x$viewsCount <- as.numeric(x$viewsCount)

# Audio features use "," as the decimal separator -- replace it with "." and
# coerce in one pass instead of nine copy-pasted lines.
decimal_comma_cols <- c("acousticness", "danceability", "energy",
                        "instrumentalness", "liveness", "loudness",
                        "speechiness", "tempo", "valence")
for (col in decimal_comma_cols) {
  x[[col]] <- as.numeric(sub(",", ".", x[[col]], fixed = TRUE))
}

# Categorical columns. NOTE(review): key is converted to factor again further
# down in the script; one conversion is enough.
x$key <- as.factor(x$key)
x$time_signature <- as.factor(x$time_signature)
# Min-max normalisation of a numeric vector onto [lower, upper].
#
# Generalises the original hard-coded [10, 1000] target range: the defaults
# preserve the previous behaviour, but callers may now choose any range.
#
# Args:
#   x:     numeric vector. Must contain at least two distinct finite values,
#          otherwise max(x) - min(x) is 0 and the result is NaN.
#   lower: lower bound of the target range (default 10).
#   upper: upper bound of the target range (default 1000).
# Returns: numeric vector of the same length as x, linearly rescaled so that
#   min(x) maps to lower and max(x) maps to upper.
min_max_normalize <- function(x, lower = 10, upper = 1000)
{
  (upper - lower) * ((x - min(x)) / (max(x) - min(x))) + lower
}
# Rescale every audio feature onto the common [10, 1000] range so the
# per-variable magnitudes are comparable in the plots below.
audio_feature_cols <- c("acousticness", "danceability", "energy",
                        "instrumentalness", "liveness", "loudness",
                        "speechiness", "tempo", "valence")
for (feature in audio_feature_cols) {
  x[[feature]] <- min_max_normalize(x[[feature]])
}

# Encode the categorical descriptors as factors.
x$Genre <- as.factor(x$Genre)
x$key <- as.factor(x$key)
x$mode <- as.factor(x$mode)
x$Charts <- as.factor(x$Charts)

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Drop the redundant Streams column. any_of() replaces the superseded
# one_of() and, like it, does not error if the column is already absent.
x <- dplyr::select(x, -any_of('Streams'))
# Keep complete rows only.
x <- x[complete.cases(x), ]

# Numeric variables analysed throughout the rest of the script.
var_list <- c('Artist_Albums_Number', 'Artist_Albums_Tracks_Number',
              'Artist_Appearances_Number', 'Artist_Appearances_Tracks_Number',
              'Artist_Compilations_Number', 'Artist_Compilations_Tracks_Number',
              'Artist_Follower', 'Artist_Popularity', 'Artist_Singles_Number',
              'Artist_Singles_Tracks_Number', 'Track_Duration_ms',
              'Track_Popularity', 'acousticness', 'danceability', 'energy',
              'instrumentalness', 'liveness', 'loudness', 'speechiness',
              'tempo', 'valence', 'commentCount', 'dislikeCount', 'likeCount',
              'viewsCount')

# Histograms with kernel-density overlays for every numeric variable.
# Iterating over the names directly avoids the fragile 1:length() pattern.
par(mar = rep(2, 4))

par(mfrow=c(5,5))

for (var_name in var_list) {
  hist(x[, var_name], probability = TRUE, col = "gray",
       main = var_name, xlab = "")
  lines(density(x[, var_name]), col = "red")
}

# Boxplots for every numeric variable. seq_along() replaces the unsafe
# 1:length() pattern, and the intermediate one-column data.frame the original
# built on every iteration was redundant.
par(mar = rep(2, 4))

par(mfrow=c(5,5))

for (i in seq_along(var_list)) {
  #ggplot(df, aes(x="",y=Artist_Albums_Number)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.colour="black", outlier.shape=16, #outlier.size=2, notch=FALSE)
  boxplot(x = x[, var_list[i]], main = var_list[i], notch = FALSE)
}

# Normality testing ----

#strictly_positive_variables <- c('Artist_Follower', 'Artist_Popularity', 'Track_Duration_ms', 'Track_Popularity', 'viewsCount', #'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')

# Kolmogorov-Smirnov normality screen for each numeric variable.
# NOTE(review): the mean and sd are estimated from the same sample, so this is
# effectively a Lilliefors-type test; 1.3581 / sqrt(n) is the asymptotic 5%
# critical value of the plain KS statistic -- confirm this is intended.
for (column_name in var_list) {
  message(column_name)

  sub_df <- as.numeric(x[, column_name])

  test_statistic <- ks.test(sub_df, "pnorm",
                            mean = mean(sub_df), sd = sd(sub_df))$statistic
  critical_value <- 1.3581 / sqrt(length(sub_df))

  if (test_statistic > critical_value) {
    # The original branch only repeated the column name; report the verdict
    # explicitly, mirroring the transformed-variable loop later in the file.
    message(paste(" ", column_name, " is not approximately normally distributed.",
                  test_statistic, critical_value))
  } else {
    message(paste(" ", column_name , " is approximately normally distributed!",
                  test_statistic, critical_value))
  }
}
## Artist_Albums_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Number
## Artist_Albums_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Tracks_Number
## Artist_Appearances_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Number
## Artist_Appearances_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Tracks_Number
## Artist_Compilations_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Number
## Artist_Compilations_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Tracks_Number
## Artist_Follower
## Artist_Follower
## Artist_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
##   Artist_Popularity  is approximately normally distributed! 0.0579112990955349 0.0970071428571429
## Artist_Singles_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Number
## Artist_Singles_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Tracks_Number
## Track_Duration_ms
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Duration_ms
## Track_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Popularity
## acousticness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## acousticness
## danceability
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## danceability
## energy
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## energy
## instrumentalness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## instrumentalness
## liveness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## liveness
## loudness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## loudness
## speechiness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## speechiness
## tempo
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## tempo
## valence
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## valence
## commentCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## commentCount
## dislikeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## dislikeCount
## likeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## likeCount
## viewsCount
## viewsCount
# Z-score the numeric variables and re-draw the histograms. A linear rescaling
# only changes location and scale, so the distribution shapes are unchanged.
x_scaled <- as.data.frame(scale(x[, var_list]))

par(mar = rep(2, 4))

par(mfrow=c(5,5))

for (var_name in var_list) {
  hist(x_scaled[, var_name], probability = TRUE, col = "gray",
       main = var_name, xlab = "")
  lines(density(x_scaled[, var_name]), col = "red")
}

# Boxplots of the z-scored variables (seq_along instead of 1:length; the
# per-iteration single-column data.frame was redundant).
par(mar = rep(2, 4))

par(mfrow=c(5,5))

for (i in seq_along(var_list)) {
  #ggplot(df, aes(x="",y=Artist_Albums_Number)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.colour="black", outlier.shape=16, #outlier.size=2, notch=FALSE)
  boxplot(x = x_scaled[, var_list[i]], main = var_list[i], notch = FALSE)
}

# KS normality screen on the z-scored variables. Standardisation is affine,
# so the statistics match the untransformed screen up to floating point.
for (column_name in var_list) {
  message(column_name)

  sub_df <- as.numeric(x_scaled[, column_name])

  test_statistic <- ks.test(sub_df, "pnorm",
                            mean = mean(sub_df), sd = sd(sub_df))$statistic
  critical_value <- 1.3581 / sqrt(length(sub_df))

  if (test_statistic > critical_value) {
    # Report the verdict instead of merely repeating the column name.
    message(paste(" ", column_name, " is not approximately normally distributed.",
                  test_statistic, critical_value))
  } else {
    message(paste(" ", column_name , " is approximately normally distributed!",
                  test_statistic, critical_value))
  }
}
## Artist_Albums_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Number
## Artist_Albums_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Tracks_Number
## Artist_Appearances_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Number
## Artist_Appearances_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Tracks_Number
## Artist_Compilations_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Number
## Artist_Compilations_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Tracks_Number
## Artist_Follower
## Artist_Follower
## Artist_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
##   Artist_Popularity  is approximately normally distributed! 0.0579112990955349 0.0970071428571429
## Artist_Singles_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Number
## Artist_Singles_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Tracks_Number
## Track_Duration_ms
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Duration_ms
## Track_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Popularity
## acousticness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## acousticness
## danceability
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## danceability
## energy
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## energy
## instrumentalness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## instrumentalness
## liveness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## liveness
## loudness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## loudness
## speechiness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## speechiness
## tempo
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## tempo
## valence
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## valence
## commentCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## commentCount
## dislikeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## dislikeCount
## likeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## likeCount
## viewsCount
## viewsCount
# Variables that are strictly positive and therefore eligible for a Box-Cox
# power transformation.
strictly_positive_variables <- c(
  'Artist_Follower', 'Artist_Popularity', 'Artist_Singles_Number',
  'Artist_Singles_Tracks_Number', 'Track_Duration_ms', 'Track_Popularity',
  'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
  'loudness', 'speechiness', 'tempo', 'valence', 'likeCount', 'viewsCount'
)

library("psych")
library("car")
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
# Objective for optimize(): the KS distance from normality of bcPower(x, p).
# Smaller is better; the minimiser over p is the chosen Box-Cox exponent.
ksD <- function (p, x) {
  transformed <- bcPower(x, p)
  ks.test(transformed, "pnorm",
          mean = mean(transformed), sd = sd(transformed))$statistic
}

# Per strictly positive variable, search [-5, 5] for the Box-Cox exponent that
# minimises the KS distance from normality. ks.test() warns about ties on
# every call, so warnings are silenced for the search and restored afterwards.
oldw <- getOption("warn")
options(warn = -1)

# Preallocate instead of growing the vector inside the loop.
min_values <- numeric(length(strictly_positive_variables))

for (column_index in seq_along(strictly_positive_variables)) {

  column_name <- strictly_positive_variables[column_index]

  x_sub <- x[[column_name]]

  result <- optimize(ksD, c(-5, 5), x = x_sub)

  min_values[column_index] <- result$minimum

  message(paste(column_index, ', minimum value is: ', result$minimum))

}
## 1 , minimum value is:  -0.0049316395173003
## 2 , minimum value is:  1.40215894049501
## 3 , minimum value is:  0.240525077020065
## 4 , minimum value is:  0.262671909258923
## 5 , minimum value is:  -0.699387491055179
## 6 , minimum value is:  0.949365177476973
## 7 , minimum value is:  0.130600084313768
## 8 , minimum value is:  3.51730755217668
## 9 , minimum value is:  3.07785781966221
## 10 , minimum value is:  -0.795345146822112
## 11 , minimum value is:  -0.303366878680729
## 12 , minimum value is:  4.64767393430379
## 13 , minimum value is:  -0.403363137393642
## 14 , minimum value is:  1.16379372652619
## 15 , minimum value is:  0.453314295028104
## 16 , minimum value is:  0.05466614868127
## 17 , minimum value is:  0.0345455532264414
options(warn = oldw)
# Apply each variable's optimal Box-Cox exponent and bind the result to
# <variable>_trans (e.g. Artist_Follower_trans), the names the rest of the
# script expects. This replaces 17 nearly identical copy-pasted chunks; the
# pointless paste() around the bare column name is gone as well.
for (column_index in seq_along(strictly_positive_variables)) {
  column_name <- strictly_positive_variables[column_index]
  assign(paste0(column_name, "_trans"),
         bcPower(x[[column_name]], min_values[column_index]))
}
# Collect the transformed vectors in the same order as
# strictly_positive_variables, so index i in one matches index i in the other.
hist_trans_list <- unname(mget(paste0(strictly_positive_variables, "_trans")))

par(mar = rep(2, 4))
par(mfrow=c(4,5))

# Histogram, fitted normal curve and KS verdict for each transformed variable.
for (trans_index in seq_along(hist_trans_list)) {

  column_name <- strictly_positive_variables[trans_index]

  # [[ ]] extracts the vector directly; the old
  # as.numeric(as.character(unlist(...))) round-trip was redundant.
  selected_trans <- hist_trans_list[[trans_index]]

  hist(selected_trans, col = "gray", probability = TRUE,
       main = column_name, xlab = "")
  # Overlay the normal density implied by the sample mean and sd.
  grid_points <- seq(min(selected_trans), max(selected_trans), length.out = 500)
  points(grid_points,
         dnorm(grid_points, mean(selected_trans), sd(selected_trans)),
         type = "l", col = "red")

  test_statistic <- ks.test(selected_trans, "pnorm",
                            mean = mean(selected_trans),
                            sd = sd(selected_trans))$statistic
  critical_value <- 1.3581 / sqrt(length(selected_trans))

  if (test_statistic > critical_value) {
    message(paste("Transformed ", column_name , " is not approximately normally distributed.", test_statistic, critical_value))
  } else {
    message(paste("Transformed ", column_name , " is approximately normally distributed!", test_statistic, critical_value))
  }
}
## Transformed  Artist_Follower  is approximately normally distributed! 0.0485813205231013 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  Artist_Popularity  is approximately normally distributed! 0.047962859540898 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  Artist_Singles_Number  is approximately normally distributed! 0.0639302516161726 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  Artist_Singles_Tracks_Number  is approximately normally distributed! 0.0334178939616578 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  Track_Duration_ms  is not approximately normally distributed. 0.0998436682047523 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  Track_Popularity  is not approximately normally distributed. 0.108864892250361 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  acousticness  is not approximately normally distributed. 0.108064790247887 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  danceability  is not approximately normally distributed. 0.113440523942993 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  energy  is not approximately normally distributed. 0.114310681616891 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  instrumentalness  is not approximately normally distributed. 0.285896259482658 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  liveness  is not approximately normally distributed. 0.103366239896501 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  loudness  is approximately normally distributed! 0.0958614307678946 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  speechiness  is approximately normally distributed! 0.0660473903454177 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  tempo  is approximately normally distributed! 0.0841680160106226 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  valence  is approximately normally distributed! 0.0879386163770323 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed  likeCount  is approximately normally distributed! 0.0696106433502905 0.0970071428571429
## Transformed  viewsCount  is approximately normally distributed! 0.0770680692211038 0.0970071428571429

# Boxplots of the Box-Cox transformed variables (seq_along instead of
# 1:length; direct [[ ]] extraction instead of the unlist round-trip).
par(mar = rep(2, 4))

par(mfrow=c(4,5))

for (trans_index in seq_along(strictly_positive_variables)) {
  column_name <- strictly_positive_variables[trans_index]
  selected_trans <- hist_trans_list[[trans_index]]
  boxplot(x = selected_trans, main = column_name, notch = FALSE)
}

# Correlation ----

library(corrplot)
## corrplot 0.84 loaded

# Pairwise Pearson correlations over all numeric variables.
method <- "pearson"
clean_cor <- cor(x[, var_list], method = method)

# Matrix of p-values for all pairwise correlation tests of the columns of mat.
#
# Fixes a hidden dependency: the original read `method` from the global
# environment; it is now an explicit parameter whose default preserves the
# previous behaviour.
#
# Args:
#   mat:    numeric matrix or data.frame whose columns are the variables.
#   ...:    further arguments forwarded to cor.test().
#   method: correlation method passed to cor.test() (default "pearson").
# Returns: symmetric matrix of p-values with a zero diagonal, carrying the
#   column names of mat on both dimensions.
cor.mtest <- function(mat, ..., method = "pearson") {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  if (n >= 2) {  # guard: the original 1:(n - 1) misbehaved for n == 1
    for (i in seq_len(n - 1)) {
      for (j in (i + 1):n) {
        tmp <- cor.test(mat[, i], mat[, j], ..., method = method)
        p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
      }
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

# matrix of the p-value of the correlation
# BUG FIX: the p-values must be computed from the DATA, not from the
# correlation matrix -- the original passed clean_cor, i.e. it tested
# correlations between columns of the correlation matrix itself.
p.mat <- cor.mtest(x[, var_list])

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
significance_level <- 0.05

corrplot(clean_cor, method="color", col=col(200),
         type="upper", order="alphabet", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=90, #Text label color and rotation
         # Combine with significance
         p.mat = p.mat, sig.level = significance_level, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=FALSE)

# Steiger’s Z test ----

#

# Fit Artist_Popularity on Artist_Follower, once on the raw variables and
# once on the Box-Cox transformed ones, then compare the two fits.
lm1 <- lm(x$Artist_Popularity ~ x$Artist_Follower)
summary(lm1)
## 
## Call:
## lm(formula = x$Artist_Popularity ~ x$Artist_Follower)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.796  -9.761   2.344  10.590  27.413 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.268e+01  1.029e+00  60.900  < 2e-16 ***
## x$Artist_Follower 1.216e-06  1.491e-07   8.154  4.3e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.72 on 194 degrees of freedom
## Multiple R-squared:  0.2552, Adjusted R-squared:  0.2514 
## F-statistic: 66.49 on 1 and 194 DF,  p-value: 4.298e-14
lm2 <- lm(Artist_Popularity_trans ~ Artist_Follower_trans)
summary(lm2)
## 
## Call:
## lm(formula = Artist_Popularity_trans ~ Artist_Follower_trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.767 -23.106  -7.232  23.326 121.213 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -108.514     12.039  -9.014   <2e-16 ***
## Artist_Follower_trans   31.226      1.017  30.713   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.89 on 194 degrees of freedom
## Multiple R-squared:  0.8294, Adjusted R-squared:  0.8285 
## F-statistic: 943.3 on 1 and 194 DF,  p-value: < 2.2e-16
# Side-by-side scatter plots with each model's R^2 in the title.
par(mfrow=c(1,2))
r2_raw <- round(summary(lm1)$r.squared, digits = 2)
r2_bc <- round(summary(lm2)$r.squared, digits = 2)
plot(x$Artist_Follower, x$Artist_Popularity,
     main = paste("Untransformed, R^2=", r2_raw),
     xlab = "Artist_Follower", ylab = "Artist_Popularity")
plot(Artist_Follower_trans, Artist_Popularity_trans,
     main = paste("Box-Cox transformed, R^2=", r2_bc),
     xlab = "Artist_Follower", ylab = "Artist_Popularity")

# Two-column frames for the multivariate normality tests below.
bivariate_df_base <- data.frame(x$Artist_Follower, x$Artist_Popularity)
bivariate_df_bc <- data.frame(Artist_Follower_trans, Artist_Popularity_trans)

library("MVN")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## sROC 0.1-2 loaded

# Run four multivariate normality tests on each bivariate frame.
# Observed results: mardia, hz and royston all rejected joint normality for
# both frames; only the "energy" test judged each pair jointly normal.
for (test_name in c("mardia", "hz", "royston", "energy")) {
  print(MVN::mvn(bivariate_df_base, mvnTest = test_name)$multivariateNormality)
}
for (test_name in c("mardia", "hz", "royston", "energy")) {
  print(MVN::mvn(bivariate_df_bc, mvnTest = test_name)$multivariateNormality)
}
# Assemble the Box-Cox transformed variables into one data frame.
# The original allocated a 196 x 17 frame of placeholder columns (X1..X17)
# and dropped them again afterwards; building the frame directly avoids the
# placeholders, the hard-coded row count, and the dplyr::select() cleanup.
boxcox_df <- data.frame(
  Artist_Follower = Artist_Follower_trans,
  Artist_Popularity = Artist_Popularity_trans,
  Artist_Singles_Number = Artist_Singles_Number_trans,
  Artist_Singles_Tracks_Number = Artist_Singles_Tracks_Number_trans,
  Track_Duration_ms = Track_Duration_ms_trans,
  Track_Popularity = Track_Popularity_trans,
  acousticness = acousticness_trans,
  danceability = danceability_trans,
  energy = energy_trans,
  instrumentalness = instrumentalness_trans,
  liveness = liveness_trans,
  loudness = loudness_trans,
  speechiness = speechiness_trans,
  tempo = tempo_trans,
  valence = valence_trans,
  likeCount = likeCount_trans,
  viewsCount = viewsCount_trans
)

par(mfrow=c(1,1))

# One colour per genre level (factor levels are ordered alphabetically).
palette <- c('red', 'blue', 'black', 'purple')
# palette <- rainbow(length(levels(as.factor(x$Genre))))
my_colors <- palette[as.numeric(x$Genre)]

# Repeat the transformed scatter plot, now coloured by genre.
main_title <- paste("Box-Cox transformed, R^2=",
                    round(summary(lm2)$r.squared, digits = 2))
plot(Artist_Follower_trans, Artist_Popularity_trans, main = main_title,
     xlab = "Artist_Follower", ylab = "Artist_Popularity",
     col = my_colors, pch = 16)
legend("bottomright", legend = levels(as.factor(x$Genre)),
       col = palette, pch = 16)

library("lattice")
# BUG FIX: the x axis carries Artist_Follower_trans, but the original label
# claimed 'viewsCount (transformed)'.
xyplot(Artist_Popularity_trans ~ Artist_Follower_trans | x$Genre, pch = 19,
       xlab = 'Artist_Follower (transformed)',
       ylab = 'Artist_Popularity (transformed)', main = "Biplot")

# pairs(boxcox_df)

# Round to even values so coincident points stack into sunflower petals.
sunflower_Artist_pop <- 2*round(boxcox_df$Artist_Popularity/2)
sunflower_viewsCount  <- 2*round(boxcox_df$viewsCount/2)
sunflowerplot(sunflower_Artist_pop ~ sunflower_viewsCount,
              xlab = 'viewsCount (transformed)',
              ylab = 'Artist_Popularity (transformed)',
              main = 'Sunflower plot')

# library() errors immediately if the package is missing, unlike require(),
# which merely returns FALSE -- preferred for a hard dependency.
library("MASS")
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Parallel-coordinates view of all transformed variables.
parcoord(boxcox_df, var.label = FALSE)

#library("lattice")
#parallelplot(boxcox_df, horizontal.axis=T)

#library("ggplot2")
#library("GGally")
#ggparcoord(boxcox_df) + geom_line()

# Andrews curves of the transformed variables.
library("andrews")
andrews(boxcox_df, ymax=7)

audio_features <- c('acousticness', 'danceability', 'energy', 'instrumentalness',
                    'liveness', 'loudness', 'speechiness', 'tempo', 'valence')

# all_of() makes the use of an external character vector explicit and
# silences the tidyselect ambiguity message the original call produced.
selected_columns <- dplyr::select(boxcox_df, dplyr::all_of(audio_features))

# Set graphics parameters up front: passing `... = par(...)` to parcoord()
# is not a real argument — it only triggered '"..." ist kein Grafikparameter'
# warnings while the par() side effect happened anyway.
par(las = 2, mar = c(8, 1, 3, 1), xpd = TRUE)
parcoord(selected_columns, col = my_colors, var.label = FALSE)
# title("Parallel coordinates", line = 0.6, adj = 0.1)
legend("topright", inset = c(0.035, -0.08),
       legend = c('Classic', 'Techno', 'Pop', 'Hip Hop'),
       col = unique(my_colors), bty = 'o', horiz = TRUE, pch = 15)

# Andrews curves of the audio features, coloured by genre; clr = 10 is the
# column index of the class factor appended below.
ax <- selected_columns
ax$clr <- x$Genre
andrews(ax, ymax = 4, clr = 10)
legend("topright", legend = levels(ax$clr),
       fill = rainbow(length(levels(as.factor(ax$clr)))))

library("scagnostics")
## Loading required package: rJava
sub_df <- as.data.frame(x[, var_list])

# Scagnostics measures for every variable pair; flag scatterplots whose
# measure profile is outlying relative to the other pairs.
s <- scagnostics(sub_df)
o <- scagnosticsOutliers(s)
g <- scagnosticsGrid(s)

go <- g[o, ]

# Outliers: strongly different from the other plots

par(mfrow = c(1, 1))

# seq_len() is safe when no outlying pairs are flagged (1:nrow would
# produce c(1, 0) for an empty grid).
for (i in seq_len(nrow(go))) {
  plot(sub_df[[go[i, 1]]], sub_df[[go[i, 2]]], pch = 19,
       xlab = names(sub_df)[go[i, 1]], ylab = names(sub_df)[go[i, 2]])
  print(paste("Outlying pair: ", "y=", names(sub_df)[go[i, 2]],
              "x=", names(sub_df)[go[i, 1]]))
}

## [1] "Outlying pair:  y= Artist_Compilations_Number x= Artist_Albums_Number"

## [1] "Outlying pair:  y= Artist_Compilations_Number x= Artist_Albums_Tracks_Number"

## [1] "Outlying pair:  y= Artist_Compilations_Tracks_Number x= Artist_Albums_Number"

## [1] "Outlying pair:  y= Artist_Compilations_Tracks_Number x= Artist_Compilations_Number"

## [1] "Outlying pair:  y= Artist_Popularity x= Artist_Follower"

## [1] "Outlying pair:  y= Artist_Singles_Number x= Artist_Albums_Number"

## [1] "Outlying pair:  y= Artist_Singles_Number x= Artist_Compilations_Tracks_Number"

## [1] "Outlying pair:  y= Artist_Singles_Tracks_Number x= Artist_Compilations_Tracks_Number"

## [1] "Outlying pair:  y= Artist_Singles_Tracks_Number x= Artist_Singles_Number"

## [1] "Outlying pair:  y= instrumentalness x= Track_Duration_ms"

## [1] "Outlying pair:  y= loudness x= Artist_Albums_Tracks_Number"

## [1] "Outlying pair:  y= valence x= instrumentalness"

## [1] "Outlying pair:  y= dislikeCount x= Artist_Follower"

## [1] "Outlying pair:  y= likeCount x= Artist_Follower"

## [1] "Outlying pair:  y= likeCount x= commentCount"
# Highlights:

# Two of the flagged outlying pairs side by side, coloured by genre.
par(mfrow = c(1, 2))
plot(x$Track_Duration_ms, x$instrumentalness,
     col = my_colors, pch = 16,
     xlab = 'Track_Duration_ms', ylab = 'instrumentalness')
plot(x$instrumentalness, x$valence,
     col = my_colors, pch = 16,
     xlab = 'instrumentalness', ylab = 'valence')
# Shared legend positioned in the outer margin (xpd = NA allows drawing
# outside the plot region).
legend(x = -1300, y = 1250, legend = levels(as.factor(x$Genre)),
       col = palette, pch = 16, horiz = TRUE, xpd = NA)

# Exemplars: group of similar plots

e <- scagnosticsExemplars(s)

ge <- g[e, ]

par(mfrow = c(2, 2))

# seq_len(nrow(ge)) instead of 1:dim(ge)[1] — safe for an empty exemplar set
# and consistent with the outlier loop above.
for (i in seq_len(nrow(ge))) {
  plot(sub_df[[ge$x[i]]], sub_df[[ge$y[i]]], pch = 19,
       xlab = names(sub_df)[ge$x[i]], ylab = names(sub_df)[ge$y[i]])
  print(paste("Exemplary pair: ", "y=", names(sub_df)[ge$y[i]],
              "x=", names(sub_df)[ge$x[i]]))
}
## [1] "Exemplary pair:  y= Track_Popularity x= Artist_Appearances_Tracks_Number"
## [1] "Exemplary pair:  y= energy x= Artist_Albums_Tracks_Number"
## [1] "Exemplary pair:  y= likeCount x= Artist_Appearances_Number"

## [1] "Exemplary pair:  y= viewsCount x= tempo"
table(x$Genre)
## 
## Classic Hip Hop     Pop  Techno 
##      49      50      50      47

# Bin artist popularity into its quartiles: 1 = below Q1, ..., 4 = at/above Q3.
Artist_Popularity_quantiles <- quantile(x$Artist_Popularity, probs = c(0.25, 0.5, 0.75))

Artist_Pop_q_25 <- Artist_Popularity_quantiles[1]
Artist_Pop_median <- Artist_Popularity_quantiles[2]
Artist_Pop_q_75 <- Artist_Popularity_quantiles[3]

# findInterval() with the default left-closed intervals reproduces the
# original four-step ifelse() cascade exactly:
#   value <  Q1            -> 0 + 1 = 1
#   Q1 <= value < median   -> 1 + 1 = 2
#   median <= value < Q3   -> 2 + 1 = 3
#   value >= Q3            -> 3 + 1 = 4
x$Artist_Popularity_quantile <- as.factor(
  findInterval(x$Artist_Popularity, Artist_Popularity_quantiles) + 1L
)

tab <- table(x$Genre, x$Artist_Popularity_quantile)
tab
##          
##            1  2  3  4
##   Classic  9 21 13  6
##   Hip Hop  0  8 25 17
##   Pop      1  8 10 31
##   Techno  37 10  0  0
# r = 4, c = 4
#df = (r-1)*(c-1) = 3+3 = 9
# critical value: 16.919


# Chi-squared test of independence between genre and popularity quartile.
chisq.test(tab)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 156.23, df = 9, p-value < 2.2e-16
# Monte-Carlo p-value as a robustness check against sparse cells
# (several cells in the table are 0).
chisq.test(tab, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  tab
## X-squared = 156.23, df = 9, p-value = 0.0004998
library("vcd")
## Loading required package: grid
# Effect-size measures for the association: contingency coefficient and
# Cramer's V (phi is NA for tables larger than 2x2).
assocstats(tab)
##                     X^2 df P(> X^2)
## Likelihood Ratio 168.67  9        0
## Pearson          156.23  9        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.666 
## Cramer's V        : 0.515
#library("openintro")

#contTable(tab)
# Bin viewsCount into its quartiles: 1 = below Q1, ..., 4 = at/above Q3.
viewsCount_quantiles <- quantile(x$viewsCount, probs = c(0.25, 0.5, 0.75))

viewsCount_q_25 <- viewsCount_quantiles[1]
viewsCount_median <- viewsCount_quantiles[2]
viewsCount_q_75 <- viewsCount_quantiles[3]

# Same left-closed quartile binning as for Artist_Popularity above:
# findInterval() replaces the original four-step ifelse() cascade while
# producing identical factor levels "1".."4".
x$viewsCount_quantile <- as.factor(
  findInterval(x$viewsCount, viewsCount_quantiles) + 1L
)


tab <- table(x$viewsCount_quantile, x$Artist_Popularity_quantile)
tab
##    
##      1  2  3  4
##   1 36  6  4  3
##   2 10 19 12  8
##   3  1 18 19 11
##   4  0  4 13 32
# Chi-squared test of independence between the two quartile classifications.
chisq.test(tab)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 133.34, df = 9, p-value < 2.2e-16
# Monte-Carlo p-value as a robustness check against sparse cells.
chisq.test(tab, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  tab
## X-squared = 133.34, df = NA, p-value = 0.0004998
library("vcd")
# Effect-size measures (contingency coefficient, Cramer's V).
assocstats(tab)
##                     X^2 df P(> X^2)
## Likelihood Ratio 133.48  9        0
## Pearson          133.34  9        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.636 
## Cramer's V        : 0.476
#contTable(tab)
# Ordinal code pairs; cbind() coerces the two factors to their integer codes.
ab2 <- na.omit(cbind(x$Artist_Popularity_quantile, x$viewsCount_quantile))
# Number of observation pairs, n*(n-1)/2 — the Kendall tau denominator.
nrow(ab2)*(nrow(ab2)-1)/2
## [1] 19110
#
# Sort by (x, y); the pair-counting loop below relies on this ordering.
ind <- order(ab2[,1], ab2[,2])
ab2 <- ab2[ind,]
# Brute-force pairwise counting for Kendall's tau:
#   C   = concordant pairs, D = discordant pairs,
#   Tx  = pairs tied on x only, Ty = pairs tied on y only,
#   Txy = pairs tied on both.
# Because ab2 is sorted by (x, y), two properties hold that the loop depends
# on: (a) for pairs with different x values, x_i < x_j always, so concordance
# is decided by the y comparison alone; (b) within a tied x-group the y values
# are non-decreasing, so the single `<` test captures every x-tie that is not
# also a y-tie.
C <- D <- Tx <- Ty <- Txy <- 0
for (i in 1:(nrow(ab2)-1)) {
  if (i%%100==0) cat(i, "\n")  # progress indicator for the O(n^2) loop
  for(j in (i+1):nrow(ab2)) {
    if (ab2[i,1]==ab2[j,1]) {
      if (ab2[i,2]==ab2[j,2]) {
        Txy <- Txy+1
      } else {
        Tx <- Tx+(ab2[i,2]<ab2[j,2])
      }
    } else {   
      if (ab2[i,2]==ab2[j,2]) Ty <- Ty+1
      if (ab2[i,2]<ab2[j,2]) C <- C+1
      if (ab2[i,2]>ab2[j,2]) D <- D+1
    }
  }
}
## 100
c(C=C, D=D, Tx=Tx, Ty=Ty, Txy=Txy)
##     C     D    Tx    Ty   Txy 
## 10048  1560  2798  2781  1923
# Tau-a: ignores ties entirely (denominator = all n*(n-1)/2 pairs).
k_t <- (C - D)/(nrow(ab2)*(nrow(ab2)-1)/2)
k_t # (without ties)
## [1] 0.4441654
# Tie-corrected tau statistics via the ryouready package; tau-b / tau-c
# matter here because the 4x4 quartile data contains many ties.
library("ryouready")
ord.tau(table(ab2[,1], ab2[,2]))
## Kendall's (and Stuart's) Tau statistics
##  Tau-b: 0.590
##  Tau-c: 0.589
# Base R's cor(method = "kendall") computes tau-b and matches ord.tau above.
cor(as.numeric(x$Artist_Popularity_quantile), as.numeric(x$viewsCount_quantile), method = "kendall") # identical result
## [1] 0.5895469
# Significance test for tau (normal approximation, tie-aware z statistic).
cor.test(as.numeric(x$Artist_Popularity_quantile), as.numeric(x$viewsCount_quantile), method="kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  as.numeric(x$Artist_Popularity_quantile) and as.numeric(x$viewsCount_quantile)
## z = 9.8748, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.5895469
modetab <- function(x, margin=0) { 
  # Count in the modal (most frequent) cell of a frequency table `x`.
  # With margin = 1 (rows) or 2 (columns), returns the per-margin modal
  # counts instead of the single overall maximum.
  if (margin <= 0) {
    return(max(x))
  }
  apply(x, margin, max)
}

# Cross-tabulate popularity quartile against genre for the manual
# Goodman-Kruskal lambda computation below.
tab2 <- table(x$Artist_Popularity_quantile, 
              x$Genre, dnn = c("Artist Popularity quantile", "Genre"))

tab2
##                           Genre
## Artist Popularity quantile Classic Hip Hop Pop Techno
##                          1       9       0   1     37
##                          2      21       8   8     10
##                          3      13      25  10      0
##                          4       6      17  31      0
# contTable(tab2)


# Marginal counts of the popularity quartiles.
tab <- rowSums(tab2)

message(paste("The mode across all four quantiles of stream distribution is", modetab(tab), " and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution."))
## The mode across all four quantiles of stream distribution is 54  and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution.
# e1: prediction errors when always guessing the marginal mode (no predictor).
e1_streams <- sum(tab) - modetab(tab)

message(paste("Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: ", e1_streams, " or ", e1_streams/sum(tab)*100, "%."))
## Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be:  142  or  72.4489795918367 %.
# e2: prediction errors when guessing each genre's (column-wise) modal quartile.
e2_streams <- sum(tab2) - sum(modetab(tab2, 2))

message(paste("Now having knowledge about an association between artist popularity and its genre and using class-internal modes for the additional feature, the number of falsely predicted cases would be: ", e2_streams, " or ", e2_streams/sum(tab)*100, "%, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, ", sum(tab2)-e2_streams, " or ", (sum(tab2)-e2_streams)/sum(tab2)*100, "% would have been predicted correctly (compared to ", (1-e1_streams/sum(tab))*100, "% if there was no knowledge about genre.)"))
## Now having knowledge about an association between artist popularity and its genre and using class-internal modes for the additional feature, the number of falsely predicted cases would be:  82  or  41.8367346938776 %, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand,  114  or  58.1632653061224 % would have been predicted correctly (compared to  27.5510204081633 % if there was no knowledge about genre.)
# Goodman & Kruskal's lambda: proportional reduction in prediction error.
lambda <- (e1_streams - e2_streams)/e1_streams

message(paste("Therefore, Goodmann and Kruskals Lambda is: ", lambda))
## Therefore, Goodmann and Kruskals Lambda is:  0.422535211267606
library("ryouready")

# Cross-check against the packaged implementation; the manual value matches
# the "Rows dependent" variant.
nom.lambda(tab2)
## Lambda:
##  Columns dependent: 0.438 
##  Rows dependent: 0.423 
##  Symmetric: 0.431
# NOTE(review): identical re-definition of the modal-count helper defined
# earlier — overall maximum cell count, or the row-/column-wise maxima when
# `margin` is 1 or 2.
modetab <- function(x, margin=0) { 
  result <- if (margin > 0) apply(x, margin, max) else max(x)
  result
}

# Same lambda analysis as above, with viewsCount quartile as the predictor.
tab2 <- table(x$Artist_Popularity_quantile, 
              x$viewsCount_quantile, dnn = c("Artist Popularity quantile", "viewsCount quantile"))

tab2
##                           viewsCount quantile
## Artist Popularity quantile  1  2  3  4
##                          1 36 10  1  0
##                          2  6 19 18  4
##                          3  4 12 19 13
##                          4  3  8 11 32
# Marginal counts of the popularity quartiles.
tab <- rowSums(tab2)

message(paste("The mode across all four quantiles of stream distribution is", modetab(tab), " and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution."))
## The mode across all four quantiles of stream distribution is 54  and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution.
# e1: errors when always predicting the marginal mode (no predictor).
e1_streams <- sum(tab) - modetab(tab)

message(paste("Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: ", e1_streams, " or ", e1_streams/sum(tab)*100, "%."))
## Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be:  142  or  72.4489795918367 %.
# e2: errors when predicting with each viewsCount quartile's modal class.
e2_streams <- sum(tab2) - sum(modetab(tab2, 2))

message(paste("Now having knowledge about an association between artist popularity and its viewsCount quantile and using class-internal modes for the additional feature, the number of falsely predicted cases would be: ", e2_streams, " or ", e2_streams/sum(tab)*100, "%, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, ", sum(tab2)-e2_streams, " or ", (sum(tab2)-e2_streams)/sum(tab2)*100, "% would have been predicted correctly (compared to ", (1-e1_streams/sum(tab))*100, "% if there was no knowledge about viewsCount quantile.)"))
## Now having knowledge about an association between artist popularity and its viewsCount quantile and using class-internal modes for the additional feature, the number of falsely predicted cases would be:  90  or  45.9183673469388 %, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand,  106  or  54.0816326530612 % would have been predicted correctly (compared to  27.5510204081633 % if there was no knowledge about viewsCount quantile.)
# Goodman & Kruskal's lambda: proportional reduction in prediction error.
lambda <- (e1_streams - e2_streams)/e1_streams

message(paste("Therefore, Goodmann and Kruskals Lambda is: ", lambda))
## Therefore, Goodmann and Kruskals Lambda is:  0.366197183098592
library("ryouready")

# Cross-check: the manual value matches the "Rows dependent" variant.
nom.lambda(tab2)
## Lambda:
##  Columns dependent: 0.388 
##  Rows dependent: 0.366 
##  Symmetric: 0.377

# ANOVA ----

# One-way ANOVA: does mean artist popularity differ between genres?
fit <- aov(Artist_Popularity~Genre , data =x)
summary(fit)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## Genre         3  31032   10344   110.3 <2e-16 ***
## Residuals   192  18006      94                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Per-genre normality check for the ANOVA: one-sample KS test of each group
# against a normal with the group's own mean/sd, compared to the asymptotic
# 5% critical value 1.3581 / sqrt(n). The loop replaces four copy-pasted
# blocks; ks.test() warns about ties in the data, which is expected here.
# Observed D values (~0.06-0.12) all stay below their critical values (~0.19),
# so normality is not rejected for any genre.
for (gnr in c('Classic', 'Techno', 'Hip Hop', 'Pop')) {
  vals <- x$Artist_Popularity[x$Genre == gnr]
  print(ks.test(vals, "pnorm", mean = mean(vals), sd = sd(vals))$statistic)
  print(1.3581 / sqrt(length(vals)))
}

# Homogeneity of variances. NOTE(review): leveneTest() comes from the 'car'
# package, which is presumably attached earlier in the script — verify.
leveneTest(Artist_Popularity ~ Genre, data = x)
# Ratio of each group variance to the smallest one (rule of thumb: the
# largest ratio should stay well below ~4-5).
varq <- tapply(x$Artist_Popularity, x$Genre, var, na.rm = TRUE)
varq / min(varq)
##  Classic  Hip Hop      Pop   Techno 
## 1.522737 1.000000 2.293584 1.117978

# PCA ----

audio_features <- c('acousticness', 'danceability', 'energy', 'instrumentalness',
                    'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
# all_of() silences the tidyselect external-vector ambiguity message.
x_features <- dplyr::select(x, dplyr::all_of(audio_features))

# PCA on the correlation matrix: standardise first, then eigen-decompose the
# covariance of the standardised data (= correlation of the raw data).
z <- scale(x_features)
eigen_cor <- eigen(cov(z))
E <- eigen_cor$vectors

# prcomp on the pre-scaled data. Spell out FALSE (not the reassignable T/F)
# and use the real argument name `scale.` (the original `scale =` only worked
# via partial matching).
pc_cor <- prcomp(z, center = FALSE, scale. = FALSE)
plot(pc_cor, main = "Scree plot as bar chart (cor)")

plot(pc_cor$sdev^2, type = "b", main = "Scree plot (cor)")

# Eigenvalue table: variance explained per component and cumulatively
# (eigenvalues of a correlation matrix sum to the number of variables).
pc_index <- paste0("PC", seq_along(audio_features))
eigen_values <- round(eigen_cor$values, digits = 2)
variance_explained <- round(eigen_cor$values / length(audio_features), digits = 2)
cum_variance <- round(cumsum(eigen_cor$values) / length(audio_features), digits = 2)
eigen_df <- data.frame(pc_index, eigen_values, variance_explained, cum_variance)
colnames(eigen_df) <- c("PC", "Eigen value", "Variance explained", "Cumulative variance explained")




as.data.frame(E)
# Eigenvector signs are arbitrary; flip selected columns so the manual
# solution matches prcomp's orientation.
E[,1] <- -E[,1]
E[,3] <- -E[,3]
E[,4] <- -E[,4]
E[,5] <- -E[,5]
E[,6] <- -E[,6]
as.data.frame(E)
# Component scores: projection of the standardised data onto the eigenvectors.
scores <- z %*% E

library(corrplot)

# Diverging red-white-blue colour ramp for the loadings heat map.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

#par(mfrow=c(1,1))
#plot((pc_cor$sdev^2)/round(sum(pc_cor$sdev^2))*100, type="b", xlab = "Number of components", ylab = "Percentage of variance explained")

# Heat map of the prcomp rotation (eigenvector) matrix with the
# coefficients printed in each cell.
corrplot(pc_cor$rotation, method="color", col=col(200), 
        addCoef.col = "black", # Add coefficient of correlation
        tl.col="black", tl.srt=90, #Text label color and rotation
         diag=TRUE, mar = c(0,0,5,5), title = 'PCA - loadings matrix')

# Manual scores vs prcomp scores — the two scatterplots coincide.
plot(scores[,1], scores[,2])

plot(pc_cor$x[,1], pc_cor$x[,2]) # both plots are identical

library("psych")
# Unrotated PCA via psych for comparison; scree() plots the eigenvalues.
psych_pca <- psych::principal(z, nfactors = 9, rotate = "none")
scree(z)

# psych_pca$scores[,1]*sqrt(psych_pca$values[1]) # == scores[,1]

# transform loadings of psych::principal to unit-length eigenvectors according to 1/sqrt(a^2 + b^2) * (a,b)
# 1/sqrt(sum(psych_pca$loadings[,1]^2))*psych_pca$loadings[,1] # is identical to pc_cor$rotation[,1] and E[,1]

# Transforming unit-length eigenvectors to loadings like in psych::principal:
# loading_k = eigenvector_k * sqrt(eigenvalue_k).

manual_loadings <- matrix(nrow = ncol(z), ncol = ncol(z))

# seq_len(ncol(z)) instead of 1:dim(z)[2] — clearer and safe for empty input.
for (i in seq_len(ncol(z))) {
  manual_loadings[, i] <- E[, i] * sqrt(eigen_cor$values[i])
}

# Still signs are interchanged; affected components are 3 and 9 — flip
# manually (eigenvector signs are arbitrary).
manual_loadings[, 3] <- -manual_loadings[, 3]
manual_loadings[, 9] <- -manual_loadings[, 9]

# Scores in psych are computed according to 

(z %*% E[,1]) / sqrt(eigen_cor$values[1]) #  == psych_pca$scores[,1]
##            [,1]
## 1   -1.59390372
## 2   -1.89244014
## 3   -1.56727583
## 4   -2.02403816
## 5   -1.34328173
## 6   -1.98652322
## 7   -1.65063718
## 8    0.23454238
## 9    0.33189660
## 10  -0.50527012
## 11   0.48474383
## 12   0.35548126
## 13   0.40438968
## 14   0.41389912
## 15   0.88932839
## 16   0.29545887
## 17   0.74966760
## 18  -0.33006285
## 19   0.42233800
## 20   0.60508283
## 21   0.35531678
## 22   1.08286743
## 23   0.33772362
## 24   0.39438136
## 25   0.38074682
## 26   0.53275627
## 27   0.35164160
## 28   0.89806447
## 29   0.34945558
## 30   0.79905861
## 31   0.64973543
## 32   0.79940254
## 33  -1.28729950
## 34   0.32285315
## 35   0.28938500
## 36   0.59057212
## 37  -0.91856390
## 38   0.54122847
## 39   0.77222028
## 40  -0.08719343
## 41   0.53776316
## 42   0.48802436
## 43   0.92409950
## 44   0.32211784
## 45   0.35080523
## 46   0.50370165
## 47   0.43417619
## 48   0.65562461
## 49   0.82640247
## 50   0.22221084
## 51   1.23872589
## 52  -0.27480596
## 53  -0.60837106
## 54   1.09983941
## 55   1.07217376
## 56   1.01232351
## 57   0.62184771
## 58   0.84979050
## 59   0.80091618
## 60   0.85471565
## 61   0.88031931
## 62   1.25090015
## 63   0.89078171
## 64   0.76487318
## 65   0.76159508
## 66   0.30109461
## 67   0.68030322
## 68   0.80407226
## 69   0.72002543
## 70   1.15107907
## 71   0.40045381
## 72   1.02252137
## 73   0.48945386
## 74   0.89961735
## 75   0.58542734
## 76   0.99733632
## 77   0.79627998
## 78   0.98885771
## 79   1.28630203
## 80   0.46651096
## 81   1.33907715
## 82   0.64998223
## 83   0.55201140
## 84   0.75942752
## 85   0.53261883
## 86   0.74383810
## 87   0.74165546
## 88   0.74866137
## 89   0.96277142
## 90   0.62715735
## 91   0.64793200
## 92   0.95334623
## 93   0.44942747
## 94   0.69025775
## 95   0.97104873
## 96   1.18362619
## 97   0.59019519
## 98  -0.39117191
## 99   0.40327243
## 100  0.99054825
## 101  0.86270002
## 102  0.67556803
## 103  1.01390165
## 104  1.36233335
## 105  0.81443559
## 106  1.12486562
## 107  0.70258335
## 108  0.10632817
## 109  0.99193014
## 110  0.75504621
## 111  0.65676336
## 112  0.33047902
## 113  0.95870755
## 114  0.75603911
## 115 -2.11696526
## 116 -1.74364531
## 117 -1.53213316
## 118 -1.94291687
## 119 -1.72239513
## 120 -1.36619922
## 121 -1.66305915
## 122 -2.06411957
## 123 -1.97969420
## 124 -1.73865879
## 125 -1.59092573
## 126 -1.75704042
## 127 -1.65120933
## 128 -0.90738230
## 129 -1.57915022
## 130 -1.79710843
## 131 -0.72000536
## 132 -1.11982385
## 133 -1.16126131
## 134 -1.26343254
## 135 -1.59056965
## 136 -1.19417247
## 137 -1.92762287
## 138 -1.75542132
## 139 -2.11166874
## 140 -1.75851617
## 141 -1.30171171
## 142 -1.64499033
## 143 -0.70384029
## 145 -2.22479894
## 146 -0.43936157
## 147 -1.98892924
## 148 -2.10319657
## 149  0.35090994
## 150  0.35960014
## 152  0.27783403
## 153  0.23520099
## 154  0.42816581
## 155  0.29415102
## 156  0.28883891
## 157  0.28901016
## 158  0.32561591
## 159  0.33618271
## 160  0.53796886
## 161  0.15721183
## 162  0.48276945
## 163  0.18837789
## 165  0.38861944
## 166  0.15693865
## 167  0.34641160
## 168  0.46655158
## 169  0.33666927
## 170  0.23963328
## 171  0.37607131
## 172  0.51586048
## 173 -0.01126555
## 174  0.40426506
## 175  0.23144730
## 176  0.40281787
## 177  0.20758467
## 178  0.24328905
## 179  0.06504826
## 180  0.25132125
## 181  0.18718854
## 183  0.39566848
## 184  0.49874147
## 185  0.39761257
## 186  0.59112779
## 187  0.52223745
## 188  0.34826820
## 189  0.17079406
## 190  0.22460558
## 191  0.18252664
## 192 -1.69078006
## 193 -1.32535523
## 194 -1.35844177
## 195 -1.97935120
## 196 -1.43855432
## 197 -1.63893155
## 198 -0.83876042
## 199 -1.32071392
## 200 -1.71405630
# i.e. using the unit-length eigenvectors and dividing them by the square root of the corresponding eigenvalue

par(mfrow = c(1, 2))
biplot(pc_cor) # loadings plot with loadings scaled to fit in same graph with scores
plot(pc_cor$rotation[, 1:2]) # loadings in original measures
text(pc_cor$rotation[, 1:2], as.character(colnames(z)), pos = 1, cex = 0.7, offset = 0.1)

# Fixed genre colour mapping (factor levels: Classic, Hip Hop, Pop, Techno).
palette <- c('red', 'blue', 'black', 'purple')
# palette <- rainbow(length(levels(as.factor(x$Genre))))
my_colors <- palette[as.numeric(x$Genre)]

# Compute the explained-variance shares for the axis labels instead of
# hard-coding 47.84% / 16.25% — stays correct if the feature set changes.
expl_var <- 100 * eigen_cor$values / sum(eigen_cor$values)
par(mfrow = c(1, 1))
plot(scores[, 1:2], col = my_colors, pch = 19,
     xlab = sprintf("Scores PC1 (%.2f%% expl. variance)", expl_var[1]),
     ylab = sprintf("Scores PC2 (%.2f%% expl. variance)", expl_var[2]))
legend("topleft", legend = levels(as.factor(x$Genre)), col = palette,
       pch = 16, horiz = TRUE)

#plot(pc_cor$rotation[, 1:2])
par(mfrow = c(1, 1))
plot(E[, 1:2])
text(E[, 1:2], as.character(colnames(z)), pos = 1, cex = 0.7, offset = 0.1)

cumsum(eigen_cor$values) / length(audio_features)
# According to the 90% criterion there would be 5 components needed to appropriately retain the variance in the data by reducing the feature space from nine to five.

# Root mean squared error between model values `m` and observed values `o`
# (vectors or matrices of matching shape). `<-` instead of `=` for
# top-level assignment, per R convention.
RMSE <- function(m, o) {
  sqrt(mean((m - o)^2))
}

# All expressions for the scores are identical

scores <- z %*% E
scores_from_psych <- (z %*% (1/sqrt(sum(psych_pca$loadings[,1]^2))*psych_pca$loadings[,1])) / sqrt(psych_pca$values[1]) # == psych_pca$scores[,1]
scores_from_princomp <- (z %*% pc_cor$rotation[,1]) / sqrt(eigen_cor$values[1])

# Reconstruction error (RMSE) and explained variance (R^2) as a function of
# the number of retained components.
fits <- matrix(nrow = ncol(z), ncol = 3)

for (i in seq_len(ncol(z))) {
  num_comp <- i
  # drop = FALSE keeps single-column selections as matrices, so the
  # reconstruction works for num_comp == 1 without relying on R's implicit
  # vector-to-matrix conformance.
  fit <- scores[, 1:num_comp, drop = FALSE] %*% t(E[, 1:num_comp, drop = FALSE])
  residuals <- z - fit
  rmse <- RMSE(z, fit)
  r2 <- 1 - (sum(diag(var(residuals))) / sum(diag(var(z))))

  fits[i, 1] <- num_comp
  fits[i, 2] <- rmse
  fits[i, 3] <- r2
}

# Overlay RMSE (left axis) and R^2 (right axis) against the number of
# components, with reference lines for the Kaiser and 90%-variance criteria.
par(mfrow = c(1, 1))
plot(fits[, 1], fits[, 2], type = "l",
     xlab = 'Number of components', ylab = 'RMSE')
par(new = TRUE)
plot(fits[, 1], fits[, 3], type = "l",
     axes = FALSE, bty = "n", xlab = "", ylab = "")
axis(side = 4, at = pretty(range(fits[, 3])))
mtext("R^2", side = 4, line = -1)
abline(v = 2, lty = 2, col = 'green')
mtext("Kaiser criterion", side = 1, line = 2, adj = 0.1)
abline(v = min(which(fits[, 3] >= 0.9)), lty = 2, col = 'red')
mtext("90% rule", side = 1, line = 1, adj = 0.5)

# Squared prediction errors / Q-residuals

par(mfrow=c(1,1))

# Reconstruction using the first a = 1 component(s).
a <- 1
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
## [1] 0.7203745
res <- z - Xhat
E2 <- res * res
# Per-observation sum of squared residuals (the SPE / Q statistic).
SPE_1 <- apply(E2, 1, sum) # not using sqrt(apply(E2, 1, sum))!

# Box (1954) method: weighted Chi-squared
# Approximate the SPE distribution by g * chisq(df = h), matching g and h to
# the sample mean and variance; the 95% quantile gives the control limit.

v <- var(SPE_1)
m <- mean(SPE_1)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k1_95 <- g*qchisq(.95, df=h) 
plot(SPE_1, ylab = paste('SPE after using', a, 'component'), xlab = 'index')
abline(h = lim_k1_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_1)), digits = 4)), side=1, line=3.5, at=9)

sqrt(mean(SPE_1))
## [1] 2.161124
## [1] 2.161124
# Same SPE analysis with the first a = 2 components retained.
a <- 2
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
## [1] 0.5977514
res <- z - Xhat
E2 <- res * res
#SPE_2 <- sqrt(apply(E2, 1, sum))
SPE_2 <- apply(E2, 1, sum)
# pca(z, ncomp = a, center = FALSE, scale = FALSE, alpha = 0.05)$Qlim[1]

# Box (1954) weighted chi-squared approximation for the 95% control limit.
v <- var(SPE_2)
m <- mean(SPE_2)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k2_95 <- g*qchisq(.95, df=h) 
plot(SPE_2, ylab = paste('SPE after using', a, 'components'), xlab = 'index')
abline(h = lim_k2_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_2)), digits = 4)), side=1, line=3.5, at=9)

sqrt(mean(SPE_2))
## [1] 1.793254
## [1] 1.793254
# Sanity check: with all 9 components the reconstruction is exact, so the
# residuals (and SPE) are zero up to floating-point noise.
a <- 9
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
## [1] 9.521473e-16
res <- z - Xhat
E2 <- res * res
# SPE_9 <- sqrt(apply(E2, 1, sum))
SPE_9 <- apply(E2, 1, sum)
# pca(z, ncomp = a, center = FALSE, scale = FALSE, alpha = 0.05)$Qlim[1]

# Box (1954) weighted chi-squared approximation for the 95% SPE limit.
v <- var(SPE_9)
m <- mean(SPE_9)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k9_95 <- g*qchisq(.95, df=h) 
plot(SPE_9, ylab = paste('SPE after using', a, 'components'), xlab = 'index')
abline(h = lim_k9_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_9)), digits = 4)), side=1, line=3.5, at=9)

sqrt(mean(SPE_9))
## [1] 2.856442e-15
# Stacked SPE traces for 1, 2 and all components, each with its own 95% limit,
# followed by the count and share of observations above the limit.
par(mfrow=c(3,1))

plot(SPE_1, type = 'l', col = 'darkgreen', ylab = "SPE for PC1")
abline(h = lim_k1_95, lty = 2, col = 'darkgreen')
plot(SPE_2, type = 'l', col = 'red', ylab = "SPE for PC1 + PC2")
abline(h = lim_k2_95, lty = 2, col = 'darkgreen')
plot(SPE_9, type = 'l', col = 'blue', ylab = "SPE for all PCs")
abline(h = lim_k9_95, lty = 2, col = 'darkgreen')

sum(SPE_1 > lim_k1_95)
## [1] 6
sum(SPE_1 > lim_k1_95)/length(SPE_1)
## [1] 0.03061224
sum(SPE_2 > lim_k2_95)
## [1] 7
sum(SPE_2 > lim_k2_95)/length(SPE_2)
## [1] 0.03571429
sum(SPE_9 > lim_k9_95)
## [1] 10
sum(SPE_9 > lim_k9_95)/length(SPE_9)
## [1] 0.05102041

# Hotelling's T^2

num_comp <- 2

# Hotelling's T^2 on the first num_comp PCs.  The scores are mutually
# uncorrelated with variances equal to the eigenvalues, so their inverse
# covariance is just diag(1 / eigenvalue).  Built directly: the original
# diag(values)^-1 was an ELEMENTWISE power, which filled the off-diagonal
# with Inf (0^-1) and needed a separate is.infinite() patch.
inverse_cov <- diag(1 / eigen_cor$values[1:num_comp], nrow = num_comp, ncol = num_comp)

tsquared <- diag(z %*% E[,1:num_comp] %*% inverse_cov %*% t(z %*% E[,1:num_comp]))

# which is equivalent to the Mahalanobis distance: (for k > 1)
# Use all retained score columns 1:num_comp; the original
# cbind(scores[,1], scores[,num_comp]) only happened to be correct because
# num_comp == 2 (it would skip the middle columns for num_comp > 2).
tsquared_mahal <- mahalanobis(scores[, 1:num_comp, drop = FALSE], center = FALSE, cov(scores[, 1:num_comp, drop = FALSE]), inverted = FALSE)

# 95% / 99% control limits from the F-distribution.
n_obs <- dim(z)[1]
tsquared_lim95 <- (((n_obs - 1) * (n_obs + 1) * num_comp) / (n_obs * (n_obs - num_comp))) * qf(p =.95, df1 = num_comp, df2 = n_obs - num_comp)
tsquared_lim99 <- (((n_obs - 1) * (n_obs + 1) * num_comp) / (n_obs * (n_obs - num_comp))) * qf(p =.99, df1 = num_comp, df2 = n_obs - num_comp)

# T^2 trace with the 95% and 99% control limits.
par(mfrow=c(1,1))

plot(tsquared, type = 'l', ylim = c(0,tsquared_lim99*1.1), ylab = "Hotelling's T2")
abline(h=tsquared_lim99, col = 'red', lty = 2)
text(190,10, "99% limit", col = "red")
abline(h=tsquared_lim95, col = 'darkgreen', lty = 2)
text(190,6.6, "95% limit", col = "darkgreen")

# source: https://github.com/hredestig/pcaMethods/blob/master/R/pca.R

simpleEllipse <- function(x, y, alfa=0.95, len=200) {
  # Hotelling's T^2 confidence ellipse for a pair of PCA score vectors.
  #
  # x, y:  score vectors (e.g. scores on PC1 and PC2)
  # alfa:  confidence level of the ellipse
  # len:   number of points used to draw the boundary
  #
  # Returns a len x 2 matrix of (x, y) coordinates on the ellipse boundary,
  # centred on (mean(x), mean(y)).
  N <- length(x)
  mypi <- seq(0, 2 * pi, length.out = len)
  # F-based T^2 scale factor for 2 components, shared by both semi-axes.
  # (Removed the unused local `A <- 2` from the original.)
  f_scale <- qf(alfa, 2, N - 2) * (2 * (N^2 - 1) / (N * (N - 2)))
  r1 <- sqrt(var(x) * f_scale)
  r2 <- sqrt(var(y) * f_scale)
  cbind(r1 * cos(mypi) + mean(x), r2 * sin(mypi) + mean(y))
}


# Score plot for PC1/PC2 with 95% and 99% confidence ellipses; axis limits are
# taken from the wider (99%) ellipse so both ellipses fit in the panel.
confidence_ellipse95 <- simpleEllipse(scores[,1], scores[,2], alfa = 0.95, len=500)
confidence_ellipse99 <- simpleEllipse(scores[,1], scores[,2], alfa = 0.99, len=500)

plot(scores[,1], scores[,2], xlim = c(min(confidence_ellipse99[,1]), max(confidence_ellipse99[,1])), 
     ylim = c(min(confidence_ellipse99[,2]), max(confidence_ellipse99[,2])), ylab = 'Scores PC2', xlab = 'Scores PC1')
abline(h = 0, v = 0)
points(confidence_ellipse95, type = 'l', lty = 2, col = 'darkgreen')
points(confidence_ellipse99, type = 'l', lty = 2, col = 'red')

# Tracks whose Hotelling's T^2 exceeds the 95% control limit.
# Index rows by NAME rather than by as.numeric(position): after the
# complete.cases() filtering the row labels of x/z need not equal row
# positions, and name-based lookup matches the x[rownames(...), ] pattern
# used for the adjOutlyingness outliers further below.
t2_outlier_ids <- names(tsquared)[tsquared > tsquared_lim95]
x[t2_outlier_ids, c('Track_Title', 'Track_Artist', 'Genre')]
outlier_tracks <- x[t2_outlier_ids, c('Track_Title', 'Track_Artist', 'Genre')]
round(z[t2_outlier_ids, ], digits = 2)
##     acousticness danceability energy instrumentalness liveness loudness
## 21          0.88         0.21  -0.51            -0.90    -0.23    -0.04
## 146         0.35         0.49  -0.79             1.26     2.13    -0.20
##     speechiness tempo valence
## 21         4.30 -0.95    1.08
## 146       -0.61 -0.41   -0.68
audio_z <- as.data.frame(z[t2_outlier_ids, ])
# audio_z holds exactly the nine standardized audio features in column order
# (see the printout above), so the nine per-column copies of the original
# collapse into a single cbind().
outlier_tracks <- cbind(outlier_tracks, audio_z)

# Finally, combining residuals and scores (i.e. Hotelling's T^2)

# cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp]

# T^2 vs Q plot; axis labels show the variance explained inside / outside the
# num_comp-component model.  Points beyond a dashed line are outliers in the
# respective statistic.
plot(tsquared, SPE_2, 
     xlab = paste("Hotelling's T2 (", round(cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp]*100, digits = 2), "%)"),
     ylab = paste("Q-residuals (", round((1-cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp])*100, digits = 2), "%)"))
abline(h = lim_k2_95, lty = 2, col = 'darkgreen') # seven residual outliers
abline(v = tsquared_lim95, lty = 2, col = 'darkgreen') #  two score outliers

# Robust multivariate outlier detection via adjusted outlyingness
# (skew-adjusted projection pursuit); clower/cupper = 0 disables the
# skewness adjustment of the cutoffs.
library("robustbase")
out <- adjOutlyingness(z, ndir=5000, clower=0, cupper=0)
hist(out$adjout)
rug(out$adjout)

z[!out$nonOut,]#  as we can confirm, the outlying cases differ strongly in their characteristics from their respective                         centers, that is especially liveness and energy are inconsistent with the model
##     acousticness danceability     energy instrumentalness   liveness   loudness
## 15    -0.0944678    0.3050881  0.4781728       -0.9049362  4.3284375  0.5324131
## 21     0.8823223    0.2129273 -0.5076300       -0.9049323 -0.2300724 -0.0352499
## 51    -0.7253561    0.2925207  1.1679094       -0.9049362  7.1823764  0.8712256
## 96    -0.5613949    1.5073675  0.8197876       -0.9042543  4.8251982  0.5040817
## 140    1.5880799   -1.8187992 -1.6889666        0.5378426  3.1693292 -1.7704512
## 145    1.4190201   -1.7098819 -1.8773428       -0.9048587 -0.3469572 -4.0775846
##     speechiness      tempo    valence
## 15   -0.3911434  1.1156910  1.3435281
## 21    4.3016000 -0.9477023  1.0815371
## 51   -0.5411446 -0.4296090  1.6680842
## 96    1.4994957 -0.0694509  0.1156894
## 140  -0.5848949 -1.9895640 -1.0077438
## 145  -0.6432287 -1.6341112 -1.1700219
# Label-based row lookup (rownames, not positions) into the original data.
x[rownames(z[!out$nonOut,]),c("Track_Title", "Track_Artist", "Genre", "Track_Popularity")]
# Horn's parallel analysis on the audio-feature matrix: retain components
# whose bias-adjusted eigenvalue exceeds 1.
library("paran")
paran(z, centile=95, all=T, graph=T)
## 
## Using eigendecomposition of correlation matrix.
## Computing: 10%  20%  30%  40%  50%  60%  70%  80%  90%  100%
## 
## 
## Results of Horn's Parallel Analysis for component retention
## 270 iterations, using the 95 centile estimate
## 
## -------------------------------------------------- 
## Component   Adjusted    Unadjusted    Estimated 
##             Eigenvalue  Eigenvalue    Bias 
## -------------------------------------------------- 
## 1           3.866214    4.305594      0.439379
## 2           1.164901    1.462154      0.297253
## 3           0.783579    0.979100      0.195521
## 4           0.698837    0.813018      0.114181
## 5           0.656256    0.694514      0.038258
## 6           0.331472    0.301951     -0.02952
## 7           0.315614    0.222367     -0.09324
## 8           0.306746    0.140507     -0.16623
## 9           0.319864    0.080790     -0.23907
## -------------------------------------------------- 
## 
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (2 components retained)

# Attach the first two audio PC scores to x, and derive a standardized
# "days since release" feature (release year expanded to Jan 1st).
# NOTE(review): assumes pc_cor$x rows align 1:1 with the rows of x at this
# point — confirm no further row filtering happened in between.
x$pc_1 <- pc_cor$x[,1]
x$pc_2 <- pc_cor$x[,2]
x$Release_Date <- as.Date(paste(x$Release_Date, 1, 1, sep = "-"))
# Days between the fixed reference date 2020-03-01 and the release date.
x$days_release_orig <- as.integer(round(difftime('2020-03-01', x$Release_Date, units = "days"), digits = 0))
x$Release_Date <- NULL
x$days_release <- as.numeric(scale(x$days_release_orig))


# Drop identifiers, categorical variables, the audio features already analysed
# above, and derived helper columns before the factor analysis.
drop.cols <- c('Artist_ID', 'Genre', 'Track_Artist','Track_ID', 'Track_Title', 'key',
               'mode', 'time_signature', 'video_ID', 'Charts', 'acousticness', 'danceability', 'energy', 'instrumentalness',
               'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'Artist_Popularity', 'Track_Popularity', 'Artist_Popularity_quantile', 'viewsCount_quantile', 'days_release_orig', 'pc_1', 'pc_2')

x_sel <- dplyr::select(x, -one_of(drop.cols))
complete_index <- as.numeric(rownames(x_sel))

# z is redefined here: standardized count/metadata variables for the FA part.
z <- scale(x_sel)

# inverse and partial correlations
p  <- solve(cor(z, use="complete.obs"))

# if the model holds then the non-diagonal elements of R^-1
# must be close to zero (relative to the diagonal element)

levelplot(p, scales=list(x=list(rot=90)))

# Partial correlations
# Standardize the negated inverse by the geometric mean of the diagonal entries.
pr <- -p/sqrt(outer(diag(p), diag(p)))
levelplot(pr, scales=list(x=list(rot=90)))

# Sampling adequacy (KMO/MSA) and Bartlett's sphericity test: both check
# whether the correlation structure justifies a factor analysis.
KMO(z)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = z)
## Overall MSA =  0.74
## MSA for each item = 
##              Artist_Albums_Number       Artist_Albums_Tracks_Number 
##                              0.81                              0.70 
##         Artist_Appearances_Number  Artist_Appearances_Tracks_Number 
##                              0.56                              0.53 
##        Artist_Compilations_Number Artist_Compilations_Tracks_Number 
##                              0.71                              0.69 
##                   Artist_Follower             Artist_Singles_Number 
##                              0.71                              0.78 
##      Artist_Singles_Tracks_Number                 Track_Duration_ms 
##                              0.60                              0.70 
##                      commentCount                      dislikeCount 
##                              0.85                              0.84 
##                         likeCount                        viewsCount 
##                              0.80                              0.85 
##                      days_release 
##                              0.65
# result: 0.74 overall: middling, every individual MSA value is as least as high as 0.53 (middling)

cortest.bartlett(z) # result: correlation matrix is NOT an identity matrix, so proceed with factor analysis
## R was not square, finding R from data
## $chisq
## [1] 3167.088
## 
## $p.value
## [1] 0
## 
## $df
## [1] 105
# Scree plot of eigenvalues as a first visual retention aid.
scree(z)

# Horn's parallel analysis on the FA variable set.
library("paran")
paran(z, centile=95, all=T, graph=T) # four factors are appropriate
## 
## Using eigendecomposition of correlation matrix.
## Computing: 10%  20%  30%  40%  50%  60%  70%  80%  90%  100%
## 
## 
## Results of Horn's Parallel Analysis for component retention
## 450 iterations, using the 95 centile estimate
## 
## -------------------------------------------------- 
## Component   Adjusted    Unadjusted    Estimated 
##             Eigenvalue  Eigenvalue    Bias 
## -------------------------------------------------- 
## 1           3.999864    4.590215      0.590350
## 2           3.297602    3.761880      0.464278
## 3           1.660186    2.026943      0.366756
## 4           1.051611    1.333118      0.281506
## 5           0.747370    0.955038      0.207667
## 6           0.510380    0.649312      0.138931
## 7           0.518990    0.598370      0.079379
## 8           0.353892    0.377677      0.023785
## 9           0.271750    0.245019     -0.02673
## 10          0.262400    0.177595     -0.08480
## 11          0.243843    0.106222     -0.13762
## 12          0.286497    0.096959     -0.18953
## 13          0.273755    0.036315     -0.23743
## 14          0.320999    0.025889     -0.29511
## 15          0.375884    0.019441     -0.35644
## -------------------------------------------------- 
## 
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (4 components retained)

# Loadings

# Four-factor extraction with different estimators (unrotated): principal
# components, principal axis, minres (fa default), and maximum likelihood.
pca <- principal(z, nfactors=4, rotate="none") # h2 are "communalities"
pa <- fa(z, nfactors=4, rotate="none", fm="pa") # principal axis extraction
uls <- fa(z, nfactors=4, rotate="none")
ml <- fa(z, nfactors=4, rotate="none", fm="ml")
library("xtable")
# xtable(unclass(ml$loadings))
print(ml$loadings, sort = T)
## 
## Loadings:
##                                   ML2    ML3    ML1    ML4   
## commentCount                       0.965        -0.144       
## dislikeCount                       0.968        -0.161       
## likeCount                          0.980        -0.145       
## viewsCount                         0.943        -0.108       
## Artist_Albums_Number                      0.750  0.503       
## Artist_Albums_Tracks_Number               0.638  0.232  0.587
## Artist_Compilations_Number                0.876  0.365       
## Artist_Compilations_Tracks_Number         0.896  0.408       
## Artist_Singles_Number                     0.358  0.685 -0.206
## Artist_Singles_Tracks_Number             -0.105  0.988       
## Artist_Appearances_Number                        0.212  0.870
## Artist_Appearances_Tracks_Number                 0.231  0.551
## Artist_Follower                    0.335                     
## Track_Duration_ms                                0.268  0.375
## days_release                              0.384  0.106  0.392
## 
##                  ML2   ML3   ML1   ML4
## SS loadings    3.879 2.849 2.315 1.756
## Proportion Var 0.259 0.190 0.154 0.117
## Cumulative Var 0.259 0.449 0.603 0.720
# Same loadings, suppressing entries below 0.4 for readability.
print(ml$loadings, cutoff=.4, sort = T)
## 
## Loadings:
##                                   ML2    ML3    ML1    ML4   
## commentCount                       0.965                     
## dislikeCount                       0.968                     
## likeCount                          0.980                     
## viewsCount                         0.943                     
## Artist_Albums_Number                      0.750  0.503       
## Artist_Albums_Tracks_Number               0.638         0.587
## Artist_Compilations_Number                0.876              
## Artist_Compilations_Tracks_Number         0.896  0.408       
## Artist_Singles_Number                            0.685       
## Artist_Singles_Tracks_Number                     0.988       
## Artist_Appearances_Number                               0.870
## Artist_Appearances_Tracks_Number                        0.551
## Artist_Follower                                              
## Track_Duration_ms                                            
## days_release                                                 
## 
##                  ML2   ML3   ML1   ML4
## SS loadings    3.879 2.849 2.315 1.756
## Proportion Var 0.259 0.190 0.154 0.117
## Cumulative Var 0.259 0.449 0.603 0.720
# Orthogonal (varimax) rotation of the ML solution; seed fixed for the
# random starts inside fa().
set.seed(42)
ml.varimax <- fa(z, nfactors=4, rotate="varimax", fm="ml") 
print(ml.varimax$loadings, cutoff=.4, sort = T)
## 
## Loadings:
##                                   ML2    ML3    ML4    ML1   
## commentCount                       0.974                     
## dislikeCount                       0.979                     
## likeCount                          0.989                     
## viewsCount                         0.950                     
## Artist_Albums_Number                      0.857              
## Artist_Compilations_Number                0.948              
## Artist_Compilations_Tracks_Number         0.976              
## Artist_Albums_Tracks_Number               0.619  0.643       
## Artist_Appearances_Number                        0.898       
## Artist_Appearances_Tracks_Number                 0.587       
## Artist_Singles_Number                     0.555         0.579
## Artist_Singles_Tracks_Number                            0.955
## Artist_Follower                                              
## Track_Duration_ms                                0.430       
## days_release                                     0.417       
## 
##                  ML2   ML3   ML4   ML1
## SS loadings    3.922 3.457 2.015 1.404
## Proportion Var 0.261 0.230 0.134 0.094
## Cumulative Var 0.261 0.492 0.626 0.720
# xtable(unclass(ml.varimax$loadings))

# Tucker's congruence coefficients between the unrotated and rotated factors.
fa.congruence(ml, ml.varimax)
##       ML2   ML3   ML4   ML1
## ML2  1.00 -0.07 -0.09 -0.01
## ML3 -0.01  0.98  0.33  0.17
## ML1 -0.20  0.69  0.45  0.89
## ML4 -0.03  0.14  0.96 -0.09
par(mfrow=c(1,1))

# Loadings plot for the first two varimax factors; variables with a salient
# loading (|loading| > threshold) on either factor are highlighted in blue.
threshold <- 0.4
plot(ml.varimax$loadings[,1], ml.varimax$loadings[,2], xlim = c(-1, 1), ylim = c(-1,1.1), xlab = 'ML2', ylab = 'ML3')
is_salient <- (abs(ml.varimax$loadings[,1]) > threshold) | (abs(ml.varimax$loadings[,2]) > threshold)
# Map the logical directly to colours.  The original
# palette[as.numeric(as.factor(col_index))] lookup keyed colours to the
# OBSERVED factor levels, so if every point fell on one side of the threshold
# all points would be drawn 'red' regardless of meaning (and `palette`
# shadowed the base function of the same name).
cols <- ifelse(is_salient, 'blue', 'red')
points(ml.varimax$loadings[,1], ml.varimax$loadings[,2], pch=19, col=cols)
abline(h = 0, v= 0)
text(ml.varimax$loadings[,1:2], as.character(rownames(ml.varimax$loadings)), pos = 1, cex = 0.6, offset = 0.5)
# Box marking the non-salient region.
rect(xleft = -threshold, ybottom = -threshold, xright = threshold, ytop = threshold)

# fa.diagram(ml.varimax, simple=TRUE, cut=.2, digits=2)
# Oblique (promax) rotation; factors are now allowed to correlate (Phi).
set.seed(42)
ml.promax <- fa(z, nfactors=4, rotate="promax", fm="ml")
## Loading required namespace: GPArotation
fa.congruence(ml.promax, ml.varimax)
##       ML2   ML3   ML4   ML1
## ML2  1.00 -0.01 -0.03 -0.08
## ML3 -0.01  0.99  0.19  0.29
## ML4  0.00  0.09  0.97  0.10
## ML1 -0.09  0.24  0.12  0.99
# Factor intercorrelation matrix of the oblique solution.
ml.promax$Phi
##             ML2        ML3        ML4         ML1
## ML2  1.00000000 -0.1234826 -0.1397866  0.07121945
## ML3 -0.12348257  1.0000000  0.3651684  0.16587388
## ML4 -0.13978664  0.3651684  1.0000000 -0.00887250
## ML1  0.07121945  0.1658739 -0.0088725  1.00000000
# Pattern matrix (loadings); oblique loadings may exceed 1.
print(ml.promax$loadings, cutoff=.4, sort = T)
## 
## Loadings:
##                                   ML2    ML3    ML4    ML1   
## commentCount                       0.984                     
## dislikeCount                       0.990                     
## likeCount                          0.999                     
## viewsCount                         0.962                     
## Artist_Albums_Number                      0.860              
## Artist_Albums_Tracks_Number               0.552  0.545       
## Artist_Compilations_Number                0.999              
## Artist_Compilations_Tracks_Number         1.018              
## Artist_Appearances_Number                        0.940       
## Artist_Appearances_Tracks_Number                 0.643       
## Artist_Singles_Number                     0.535         0.541
## Artist_Singles_Tracks_Number                            0.956
## Artist_Follower                                              
## Track_Duration_ms                                0.436       
## days_release                                                 
## 
##                  ML2   ML3   ML4   ML1
## SS loadings    3.989 3.519 2.033 1.346
## Proportion Var 0.266 0.235 0.136 0.090
## Cumulative Var 0.266 0.501 0.636 0.726
#xtable(unclass(ml.promax$loadings))

# Structure matrix = pattern matrix %*% Phi (item-factor correlations).
print(ml.promax$Structure, cutoff = 0.4)
## 
## Loadings:
##                                   ML2    ML3    ML4    ML1   
## Artist_Albums_Number                      0.889              
## Artist_Albums_Tracks_Number               0.721  0.744       
## Artist_Appearances_Number                        0.891       
## Artist_Appearances_Tracks_Number                 0.565       
## Artist_Compilations_Number                0.941              
## Artist_Compilations_Tracks_Number         0.980              
## Artist_Follower                                              
## Artist_Singles_Number                     0.582         0.632
## Artist_Singles_Tracks_Number                            0.958
## Track_Duration_ms                                0.449       
## commentCount                       0.974                     
## dislikeCount                       0.979                     
## likeCount                          0.989                     
## viewsCount                         0.947                     
## days_release                              0.426  0.477       
## 
##                  ML2   ML3   ML4   ML1
## SS loadings    3.969 3.890 2.427 1.548
## Proportion Var 0.265 0.259 0.162 0.103
## Cumulative Var 0.265 0.524 0.686 0.789
# For orthoghonal rotations: cor(item, factor) = loadings ("pattern") matrix = structure matrix, 
# since structure matrix = loadings matrix *%* factor intercorrelation matrix (which is identity for orthogonal rotations)
# example: ml.promax$loadings %*% ml.promax$Phi = structure matrix


par(mfrow=c(1,1))

# Structure-matrix plot for the first two promax factors; variables with a
# salient correlation (|r| > threshold) on either factor are shown in blue.
threshold <- 0.5
plot(ml.promax$Structure[,1], ml.promax$Structure[,2], xlim = c(-1, 1), ylim = c(-1,1.1), xlab = 'ML2', ylab = 'ML3')
is_salient <- (abs(ml.promax$Structure[,1]) > threshold) | (abs(ml.promax$Structure[,2]) > threshold)
# Direct logical-to-colour mapping; the original factor-level lookup
# (palette[as.numeric(as.factor(...))]) breaks when all points fall on one
# side of the threshold, and `palette` shadowed the base function.
cols <- ifelse(is_salient, 'blue', 'red')
points(ml.promax$Structure[,1], ml.promax$Structure[,2], pch=19, col=cols)
abline(h = 0, v= 0)
text(ml.promax$Structure[,1:2], as.character(rownames(ml.promax$Structure)), pos = 1, cex = 0.6, offset = 0.5)
# Box marking the non-salient region.
rect(xleft = -threshold, ybottom = -threshold, xright = threshold, ytop = threshold)

# Compare factor scores across estimation routes: stats::factanal (regression
# scores), psych::fa defaults, and Bartlett scores.  High diagonal
# correlations show the solutions agree up to rotation/scaling.
fa1 <-factanal(z, factors=4, scores = 'regression', lower = 0.1) # ML with Kaiser normalization 
head(fa1$scores)
##       Factor1   Factor2    Factor3    Factor4
## 1 -0.09032514 4.7229399 -1.1982826  1.1913023
## 2 -0.06532926 1.9592530  0.6796187  1.9294387
## 3 -0.17155114 1.2161697  2.5043243 -0.6176557
## 4 -0.14220859 4.8213406 -1.0227099  0.2499034
## 5 -0.15666866 6.0158376 -1.7644434 -0.1556914
## 6 -0.16905143 0.1190256  2.0571621 -0.1523273
fa2 <- fa(z, nfactors=4) # oblimin rotation without Kaiser normalization
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
head(fa2$scores)
##          MR2       MR1        MR3        MR4
## 1 -0.1257348 4.9076402 -1.4064326  3.2185555
## 2 -0.2186062 2.3024182  0.7123318  2.1407078
## 3 -0.2078290 1.3247000  2.3948606 -0.3356255
## 4 -0.1849004 4.2448612  0.0405285  1.8486009
## 5 -0.2345250 5.4651211 -0.7047564  1.0788027
## 6 -0.2682063 0.3496226  1.8536951  0.1795223
cor(fa1$scores, fa2$scores)
##                 MR2         MR1         MR3        MR4
## Factor1  0.99066727 -0.04263156 -0.03183455 0.02879820
## Factor2 -0.02300522  0.98601395  0.12472840 0.27124108
## Factor3 -0.03546517  0.11316250  0.97914604 0.09737382
## Factor4 -0.05458080  0.13298949 -0.03618965 0.93878256
cor(fa1$scores, ml.promax$scores)
##                  ML2         ML3         ML4         ML1
## Factor1  0.995994444 -0.06746742 -0.08311111 0.060236052
## Factor2 -0.051288886  0.97618626  0.22252324 0.086347208
## Factor3 -0.056169205  0.17796182  0.97556026 0.002189822
## Factor4  0.007625069  0.11489683  0.02445173 0.989366259
cor(ml.promax$scores, ml.varimax$scores)
##             ML2         ML3          ML4        ML1
## ML2  0.99714527 -0.05043569 -0.058887859 0.01154380
## ML3 -0.06617328  0.97785070  0.193263920 0.08735674
## ML4 -0.08094263  0.19810346  0.979273972 0.03916156
## ML1  0.06253243  0.10010835 -0.004175889 0.99271103
# Bartlett scores for the varimax solution; nearly identical to the
# regression scores and essentially uncorrelated across factors.
fa_bartlett_scores <- fa(z, nfactors=4, rotate="varimax", fm="ml", scores = 'Bartlett')
cor(ml.varimax$scores, fa_bartlett_scores$scores)
##               ML2           ML3          ML4           ML1
## ML2  9.999945e-01 -1.574145e-18 8.383016e-17 -1.545713e-17
## ML3  4.392591e-17  9.999300e-01 7.699237e-16 -1.960363e-16
## ML4  3.900769e-18  7.222804e-19 9.995550e-01  3.371781e-17
## ML1 -5.920793e-18 -3.169430e-16 5.198842e-16  9.996269e-01
cor(fa_bartlett_scores$scores)
##               ML2           ML3          ML4           ML1
## ML2  1.0000000000  0.0003036624  0.003257071 -0.0006236104
## ML3  0.0003036624  1.0000000000 -0.011669369 -0.0016210978
## ML4  0.0032570710 -0.0116693685  1.000000000 -0.0272409055
## ML1 -0.0006236104 -0.0016210978 -0.027240905  1.0000000000
# sum scores

# Build unit-weight sum scores: each factor is represented by the variables
# whose structure coefficient exceeds the threshold, keyed with the sign of
# the coefficient, and scored via psych::scoreItems.
threshold <- 0.5

vars_1 <- abs(ml.promax$Structure[,1])>threshold
vars_2 <- abs(ml.promax$Structure[,2])>threshold
vars_3 <- abs(ml.promax$Structure[,3])>threshold
vars_4 <- abs(ml.promax$Structure[,4])>threshold

key.list <- list(one = as.numeric(which(vars_1==1)), 
                two = as.numeric(which(vars_2==1)), 
                three = as.numeric(which(vars_3==1)), 
                four = as.numeric(which(vars_4==1)))

# Sign of every structure coefficient, to reverse-key negative items.
sign.mat <- cbind(sign(ml.promax$Structure[,1]), 
                      sign(ml.promax$Structure[,2]),
                      sign(ml.promax$Structure[,3]),
                      sign(ml.promax$Structure[,4]))

keys <- make.keys(z, key.list,item.labels = colnames(z))

si <- scoreItems(keys * sign.mat, z) # these are the scores I finally use

pairs(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]))

cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]))
##             [,1]        [,2]        [,3]        [,4]
## [1,]  1.00000000 -0.07190494 -0.04542303 -0.04038714
## [2,] -0.07190494  1.00000000  0.42361028  0.62898318
## [3,] -0.04542303  0.42361028  1.00000000  0.19564730
## [4,] -0.04038714  0.62898318  0.19564730  1.00000000
# Pattern matrix at the 0.5 cutoff, used to pick the items for the manual
# sum scores below.
print(ml.promax$loadings, cutoff = 0.5, sort = T)
## 
## Loadings:
##                                   ML2    ML3    ML4    ML1   
## commentCount                       0.984                     
## dislikeCount                       0.990                     
## likeCount                          0.999                     
## viewsCount                         0.962                     
## Artist_Albums_Number                      0.860              
## Artist_Albums_Tracks_Number               0.552  0.545       
## Artist_Compilations_Number                0.999              
## Artist_Compilations_Tracks_Number         1.018              
## Artist_Appearances_Number                        0.940       
## Artist_Appearances_Tracks_Number                 0.643       
## Artist_Singles_Number                     0.535         0.541
## Artist_Singles_Tracks_Number                            0.956
## Artist_Follower                                              
## Track_Duration_ms                                            
## days_release                                                 
## 
##                  ML2   ML3   ML4   ML1
## SS loadings    3.989 3.519 2.033 1.346
## Proportion Var 0.266 0.235 0.136 0.090
## Cumulative Var 0.266 0.501 0.636 0.726
# I apply a threshold of 0.5 for manual computation of sum scores

# Manual unit-weight sum scores (mean of the standardized items per factor).
f2 <- (z[,11] + z[,12] + z[,13] + z[,14])/4 # commentCount, dislikeCount, likeCount, viewsCount
# NOTE(review): the indices here look inconsistent with the comment.  If the
# columns of z follow the order of x_sel, z[,1] + z[,2] + z[,5] would be
# Artist_Albums_Number, Artist_Albums_Tracks_Number and
# Artist_Compilations_Number — the comment instead names columns 1, 5 and 6.
# Confirm whether the indices or the comment is wrong.
f3 <- (z[,1] + z[,2] + z[,5])/3 # Artist_Albums_Number, Artist_Compilations_Number, Artist_Compilations_Tracks_Number
f4 <- (z[,2] + z[,3] + z[,4])/3 # Artist_Albums_Tracks_Number, Artist_Appearances_Number, Artist_Appearances_Tracks_Number
f1 <- (z[,8] + z[,9])/2 # Artist_Singles_Number, Artist_Singles_Tracks_Number

pairs(cbind(f2, f3, f4, f1))

cor(cbind(f2, f3, f4, f1))
##             f2          f3          f4          f1
## f2  1.00000000 -0.08246012 -0.04542303 -0.04038714
## f3 -0.08246012  1.00000000  0.51683413  0.47961885
## f4 -0.04542303  0.51683413  1.00000000  0.19564730
## f1 -0.04038714  0.47961885  0.19564730  1.00000000
# Internal consistency (Cronbach's alpha) per factor scale, computed from the
# correlation matrix of each item set (so raw and standardized alpha agree).
psych::alpha(cor(z[,vars_1]), check.keys = T) # commentCount, dislikeCount, likeCount, viewsCount
## 
## Reliability analysis   
## Call: psych::alpha(x = cor(z[, vars_1]), check.keys = T)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N median_r
##       0.99      0.99    0.99      0.95  74     0.96
## 
##  Reliability if an item is dropped:
##              raw_alpha std.alpha G6(smc) average_r S/N   var.r med.r
## commentCount      0.98      0.98    0.98      0.95  55 7.2e-04  0.95
## dislikeCount      0.98      0.98    0.98      0.94  49 9.3e-04  0.95
## likeCount         0.98      0.98    0.97      0.93  42 1.2e-03  0.92
## viewsCount        0.99      0.99    0.98      0.97  96 1.5e-05  0.97
## 
##  Item statistics 
##                 r r.cor r.drop
## commentCount 0.98  0.97   0.97
## dislikeCount 0.98  0.98   0.97
## likeCount    0.99  0.99   0.99
## viewsCount   0.96  0.95   0.94
psych::alpha(cor(z[,vars_2]), check.keys = T) # Artist_Albums_Number, Artist_Albums_Tracks_Number, Artist_Compilations_Number, Artist_Compilations_Tracks_Number, Artist_Singles_Number
## 
## Reliability analysis   
## Call: psych::alpha(x = cor(z[, vars_2]), check.keys = T)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N median_r
##       0.91      0.91    0.92      0.66 9.7     0.63
## 
##  Reliability if an item is dropped:
##                                   raw_alpha std.alpha G6(smc) average_r  S/N
## Artist_Albums_Number                   0.86      0.86    0.88      0.60  6.1
## Artist_Albums_Tracks_Number            0.92      0.92    0.92      0.75 11.7
## Artist_Compilations_Number             0.86      0.86    0.87      0.61  6.2
## Artist_Compilations_Tracks_Number      0.85      0.85    0.85      0.59  5.7
## Artist_Singles_Number                  0.92      0.92    0.93      0.75 12.1
##                                   var.r med.r
## Artist_Albums_Number              0.047  0.60
## Artist_Albums_Tracks_Number       0.024  0.72
## Artist_Compilations_Number        0.039  0.63
## Artist_Compilations_Tracks_Number 0.034  0.61
## Artist_Singles_Number             0.023  0.74
## 
##  Item statistics 
##                                      r r.cor r.drop
## Artist_Albums_Number              0.93  0.92   0.89
## Artist_Albums_Tracks_Number       0.73  0.64   0.59
## Artist_Compilations_Number        0.93  0.94   0.88
## Artist_Compilations_Tracks_Number 0.95  0.97   0.92
## Artist_Singles_Number             0.72  0.62   0.58
psych::alpha(cor(z[,vars_3]), check.keys = T) # Artist_Albums_Tracks_Number, Artist_Appearances_Number, Artist_Appearances_Tracks_Number, Track_Duration_ms 
## 
## Reliability analysis   
## Call: psych::alpha(x = cor(z[, vars_3]), check.keys = T)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N median_r
##       0.74      0.74    0.69      0.48 2.8     0.52
## 
##  Reliability if an item is dropped:
##                                  raw_alpha std.alpha G6(smc) average_r  S/N
## Artist_Albums_Tracks_Number           0.69      0.69    0.52      0.52 2.18
## Artist_Appearances_Number             0.47      0.47    0.31      0.31 0.88
## Artist_Appearances_Tracks_Number      0.76      0.76    0.62      0.62 3.21
##                                  var.r med.r
## Artist_Albums_Tracks_Number         NA  0.52
## Artist_Appearances_Number           NA  0.31
## Artist_Appearances_Tracks_Number    NA  0.62
## 
##  Item statistics 
##                                     r r.cor r.drop
## Artist_Albums_Tracks_Number      0.79  0.65   0.53
## Artist_Appearances_Number        0.88  0.82   0.70
## Artist_Appearances_Tracks_Number 0.75  0.55   0.46
psych::alpha(cor(z[,vars_4]), check.keys = T) # Artist_Singles_Number, Artist_Singles_Tracks_Number
## Warning in matrix(unlist(drop.item), ncol = 10, byrow = TRUE): Datenlänge [16]
## ist kein Teiler oder Vielfaches der Anzahl der Spalten [10]
## 
## Reliability analysis   
## Call: psych::alpha(x = cor(z[, vars_4]), check.keys = T)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N median_r
##       0.78      0.78    0.65      0.65 3.6     0.65
## 
##  Reliability if an item is dropped:
##                              raw_alpha std.alpha G6(smc) average_r S/N var.r
## Artist_Singles_Number             0.65      0.65    0.42      0.65  NA  0.65
## Artist_Singles_Tracks_Number      0.42      0.65      NA        NA  NA  0.42
##                              med.r
## Artist_Singles_Number         0.65
## Artist_Singles_Tracks_Number  0.65
## 
##  Item statistics 
##                                 r r.cor r.drop
## Artist_Singles_Number        0.91  0.73   0.65
## Artist_Singles_Tracks_Number 0.91  0.73   0.65
# Per-factor item matrices with negatively keyed items reversed; the keying
# vector is the non-zero entries of the signed key column.
items_1 <- reverse.code((keys[,1] * sign.mat[,1])[(keys[,1] * sign.mat[,1]) != 0], z[, vars_1])
items_2 <- reverse.code((keys[,2] * sign.mat[,2])[(keys[,2] * sign.mat[,2]) != 0], z[, vars_2])
items_3 <- reverse.code((keys[,3] * sign.mat[,3])[(keys[,3] * sign.mat[,3]) != 0], z[, vars_3])
items_4 <- reverse.code((keys[,4] * sign.mat[,4])[(keys[,4] * sign.mat[,4]) != 0], z[, vars_4])

library("additivityTests")

# Tukey one-degree-of-freedom additivity test per item bundle: if H0
# (additivity) is rejected, a plain sum score misrepresents the items.
tukey.test(items_1) # result: H0 rejected, i.e. sum score is insufficient to represent the variables (factor 2)
## 
## Tukey test on 5% alpha-level:
## 
## Test statistic: 16.19 
## Critival value: 3.857 
## The additivity hypothesis was rejected.
# Additivity test for the factor-3 item bundle.
tukey.test(items_2) # result: H0 rejected (factor 3)
## 
## Tukey test on 5% alpha-level:
## 
## Test statistic: 9.276 
## Critival value: 3.853 
## The additivity hypothesis was rejected.
# Additivity test for the factor-4 item bundle; sum score acceptable here.
tukey.test(items_3) # result: H0 cannot be rejected (factor 4)
## 
## Tukey test on 5% alpha-level:
## 
## Test statistic: 0.3105 
## Critival value: 3.865 
## The additivity hypothesis cannot be rejected.
# Additivity test for the factor-1 item bundle; sum score acceptable here.
tukey.test(items_4) # result: H0 cannot be rejected (factor 1)
## 
## Tukey test on 5% alpha-level:
## 
## Test statistic: 0.004627 
## Critival value: 3.89 
## The additivity hypothesis cannot be rejected.
# Agreement between the regression-based fa() scores and the scoreItems scores.
cor(ml.promax$scores, si$scores)
##             one         two        three         four
## ML2  0.99592621 -0.09140514 -0.079332072 -0.002099964
## ML3 -0.09594147  0.97336006  0.416099225  0.487503635
## ML4 -0.11258045  0.39562748  0.960537575  0.200024816
## ML1  0.01023158  0.29765356 -0.008730584  0.882141227
# Same 4-factor promax ML solution, but with Bartlett-weighted factor scores.
fa_bartlett_scores <- fa(z, nfactors=4, rotate="promax", fm="ml", scores = 'Bartlett')
cor(ml.promax$scores, fa_bartlett_scores$scores)
##             ML2        ML3          ML4          ML1
## ML2  0.99999331 -0.1228703 -0.132348818  0.071013967
## ML3 -0.12409086  0.9999497  0.347443877  0.166211182
## ML4 -0.14750946  0.3834347  0.999105855 -0.009335731
## ML1  0.07138876  0.1654449 -0.008420436  0.999492034
# scoreItems scores vs Bartlett scores.
cor(si$scores, fa_bartlett_scores$scores)
##                ML2         ML3         ML4         ML1
## one    0.995760702 -0.09422492 -0.09953451  0.01023589
## two   -0.091504358  0.97260263  0.36373011  0.29006661
## three -0.082455588  0.42220226  0.95965169 -0.03857534
## four  -0.001308527  0.48171167  0.16518758  0.87773348
# Manual sum scores (f2, f3, f4, f1) vs scoreItems scores.
cor(cbind(f2, f3, f4, f1), si$scores) # are almost identical now
##            one         two       three        four
## f2  1.00000000 -0.07190494 -0.04542303 -0.04038714
## f3 -0.08246012  0.97220279  0.51683413  0.47961885
## f4 -0.04542303  0.42361028  1.00000000  0.19564730
## f1 -0.04038714  0.62898318  0.19564730  1.00000000
# Manual sum scores vs Bartlett scores.
cor(cbind(f2, f3, f4, f1), fa_bartlett_scores$scores)
##             ML2         ML3         ML4         ML1
## f2  0.995760702 -0.09422492 -0.09953451  0.01023589
## f3 -0.113694191  0.96801245  0.47604146  0.14747026
## f4 -0.082455588  0.42220226  0.95965169 -0.03857534
## f1 -0.001308527  0.48171167  0.16518758  0.87773348
# Repeated scoreItems-vs-Bartlett comparison (same values as above).
cor(si$scores, fa_bartlett_scores$scores)
##                ML2         ML3         ML4         ML1
## one    0.995760702 -0.09422492 -0.09953451  0.01023589
## two   -0.091504358  0.97260263  0.36373011  0.29006661
## three -0.082455588  0.42220226  0.95965169 -0.03857534
## four  -0.001308527  0.48171167  0.16518758  0.87773348
# I choose the manually computed scoreItems and sum scores omitting <= 0.5 structure loadings and assigning the item with the larger loading to a single factor in case of ambiguity

# Distributions of the first two factor scores under the three scoring
# variants, with rugs marking charting (blue) vs non-charting (red) tracks.
# The original ifelse(cond, TRUE, FALSE) wrappers were redundant — the
# comparison itself already yields the logical mask.
in_charts <- x[complete_index, 'Charts'] == 1

par(mfrow=c(1,2))
hist(ml.promax$scores[,1], main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(ml.promax$scores[in_charts, 1], col="blue")
rug(ml.promax$scores[!in_charts, 1], col="red")

hist(ml.promax$scores[,2], main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(ml.promax$scores[in_charts, 2], col="blue")
rug(ml.promax$scores[!in_charts, 2], col="red")

par(mfrow=c(1,2))
hist(si$scores[,1], main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(si$scores[in_charts, 1], col="blue")
rug(si$scores[!in_charts, 1], col="red")

hist(si$scores[,2], main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(si$scores[in_charts, 2], col="blue")
rug(si$scores[!in_charts, 2], col="red")

par(mfrow=c(1,2))
hist(f2, main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(f2[in_charts], col="blue")
rug(f2[!in_charts], col="red")

hist(f3, main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(f3[in_charts], col="blue")
rug(f3[!in_charts], col="red")

# Persist the scoreItems factor scores on x; scoreItems columns 1..4
# correspond to factors 2, 3, 4 and 1 respectively (hence the naming).
x$factor_2 <- si$scores[,1]
x$factor_3 <- si$scores[,2]
x$factor_4 <- si$scores[,3]
x$factor_1 <- si$scores[,4]

# Manually computed sum scores for the same four factors.
x$factor_2man <- f2
x$factor_3man <- f3
x$factor_4man <- f4
x$factor_1man <- f1

# Cross-method agreement: scoreItems vs manual sum scores.
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), cbind(f2, f3, f4, f1))
##               f2          f3          f4          f1
## [1,]  1.00000000 -0.08246012 -0.04542303 -0.04038714
## [2,] -0.07190494  0.97220279  0.42361028  0.62898318
## [3,] -0.04542303  0.51683413  1.00000000  0.19564730
## [4,] -0.04038714  0.47961885  0.19564730  1.00000000
# scoreItems scores vs Bartlett scores (column by column).
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), fa_bartlett_scores$scores)
##               ML2         ML3         ML4         ML1
## [1,]  0.995760702 -0.09422492 -0.09953451  0.01023589
## [2,] -0.091504358  0.97260263  0.36373011  0.29006661
## [3,] -0.082455588  0.42220226  0.95965169 -0.03857534
## [4,] -0.001308527  0.48171167  0.16518758  0.87773348
# scoreItems scores vs regression-based promax scores.
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), ml.promax$scores)
##               ML2         ML3        ML4          ML1
## [1,]  0.995926214 -0.09594147 -0.1125804  0.010231584
## [2,] -0.091405140  0.97336006  0.3956275  0.297653556
## [3,] -0.079332072  0.41609922  0.9605376 -0.008730584
## [4,] -0.002099964  0.48750364  0.2000248  0.882141227
# Also keep the Bartlett variants of the four factor scores on x.
x$factor_2bartlett <- fa_bartlett_scores$scores[,1]
x$factor_3bartlett <- fa_bartlett_scores$scores[,2]
x$factor_4bartlett <- fa_bartlett_scores$scores[,3]
x$factor_1bartlett <- fa_bartlett_scores$scores[,4]
# Standardise the audio-feature matrix for the clustering section below.
z <- scale(x_features)

library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Elbow method
# Total within-cluster sum of squares vs k; the dashed line marks k = 4.
fviz_nbclust(z, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) + labs(title = "")

# k-means on the scaled features with k = 4 (the elbow choice above).
# FIX: the original call used `c = 4`, which only works via partial argument
# matching against `centers`; spelled out (identical behavior).
set.seed(42)
kz <- kmeans(z, centers = 4)

# Capture observation names per raw cluster label first...
k_techno <- names(kz$cluster[kz$cluster == 1])
k_hiphop <- names(kz$cluster[kz$cluster == 2])
k_pop <- names(kz$cluster[kz$cluster == 3])
k_classic <- names(kz$cluster[kz$cluster == 4])

# ...then relabel in place so cluster ids line up with the genre level order
# (1 = Classic, 2 = Hip Hop, 3 = Pop, 4 = Techno).
kz$cluster[k_techno] <- 4
kz$cluster[k_pop] <- 3
kz$cluster[k_hiphop] <- 2
kz$cluster[k_classic] <- 1

par(mfcol=c(1,1))
# k-means labels in the space of the first two principal components.
plot(pc_cor$x[,1], pc_cor$x[,2], col=kz$cluster, pch=19, xlab = 'PC1', ylab = 'PC2')

# project cluster centers to PCA feature space:

cluster_centers_pca <- kz$centers %*% pc_cor$rotation[, 1:2]
points(cluster_centers_pca[,1], cluster_centers_pca[,2], col = 'magenta', pch = 10, cex = 3)

# True genres in the same projection, for visual comparison.
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19, xlab = 'PC1', ylab = 'PC2')

library("cluster")

# Silhouette widths of the k-means 4-cluster solution.
# TRUE is spelled out instead of the reassignable shorthand T.
sk_4 <- silhouette(kz$cluster, dist(z))
plot(sk_4, col=1:4, border=NA, ann = TRUE, main = "")

library("caret")
## 
## Attaching package: 'caret'
## The following object is masked _by_ '.GlobalEnv':
## 
##     RMSE
# Cross-tabulate k-means labels against true genres (labels aligned above).
confusionMatrix(as.factor(as.numeric(kz$cluster)), as.factor(as.numeric(x$Genre)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1  1  1  0 47
##          2  0 34 30  0
##          3 48  0  3  0
##          4  0 15 17  0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.1939         
##                  95% CI : (0.141, 0.2563)
##     No Information Rate : 0.2551         
##     P-Value [Acc > NIR] : 0.9821         
##                                          
##                   Kappa : -0.0767        
##                                          
##  Mcnemar's Test P-Value : <2e-16         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity          0.020408   0.6800  0.06000   0.0000
## Specificity          0.673469   0.7945  0.67123   0.7852
## Pos Pred Value       0.020408   0.5312  0.05882   0.0000
## Neg Pred Value       0.673469   0.8788  0.67586   0.7134
## Prevalence           0.250000   0.2551  0.25510   0.2398
## Detection Rate       0.005102   0.1735  0.01531   0.0000
## Detection Prevalence 0.250000   0.3265  0.26020   0.1633
## Balanced Accuracy    0.346939   0.7373  0.36562   0.3926
# Although k-means is not a classification algorithm, we can compute the proportions of genres per cluster to assign
# an interpretation for each cluster.

tab_kmeans <- table(kz$cluster, x$Genre)
#xtable(round(prop.table(tab_kmeans, margin = 2), digits = 2)*100)
# FIX: the former `tab_kmeans/colSums(tab_kmeans)` recycled the column sums
# down the rows (R matrices are column-major), dividing row i by colSums[i]
# instead of normalising each genre column.  prop.table(, margin = 2) gives
# the intended per-genre proportions; the percentages quoted below stem from
# the old computation and shift slightly under the corrected normalisation.
round(prop.table(tab_kmeans, margin = 2), digits = 2) * 100
# It seems that Classic music and Techno music can be accurately distinguished but between Hip Hop and Pop there are 
# many false negatives, e.g. 62% of the tracks were predicted to be of the genre Pop although the truth was that they were
# of class Hip Hop, underlining the ambiguity between those genres in terms of music theoretical characteristics nowadays.
# Vice versa, 16% of the tracks predicted to be Hip Hop music were actually Pop music.
# k-medoids (PAM) with k = 4 on the same scaled features.
set.seed(42)
cl2 <- pam(z, 4)
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$clustering)

# Capture names per raw PAM label, then relabel to match the genre level
# order (1 = Classic, 2 = Hip Hop, 3 = Pop, 4 = Techno).
k_classic <- names(cl2$clustering[cl2$clustering == 1])
k_techno <- names(cl2$clustering[cl2$clustering == 2])
k_pop <- names(cl2$clustering[cl2$clustering == 3])
k_hiphop <- names(cl2$clustering[cl2$clustering == 4])

cl2$clustering[k_techno] <- 4
cl2$clustering[k_pop] <- 3
cl2$clustering[k_hiphop] <- 2
cl2$clustering[k_classic] <- 1


par(mfcol=c(1,1))
# PAM labels in PC space; magenta crosshairs mark the medoid observations.
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$clustering, pch=19, xlab = 'PC1', ylab = 'PC2')
points(pc_cor$x[rownames(cl2$medoids),1], pc_cor$x[rownames(cl2$medoids),2], col = 'magenta', pch = 10, cex = 3)

# True genres in the same projection.
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19, xlab = 'PC1', ylab = 'PC2')

# xtable(x[rownames(cl2$medoids), c('Track_Title', 'Track_Artist', 'Genre')])

# The medoid tracks themselves — the most "typical" song per cluster.
x[rownames(cl2$medoids), c('Track_Title', 'Track_Artist', 'Genre')]
sm_4 <- silhouette(cl2$clustering, dist(z))
plot(sm_4, col=1:4, border=NA, main = "")

# Cross-tabulate PAM labels against true genres (labels aligned above).
confusionMatrix(as.factor(as.numeric(cl2$clustering)), as.factor(as.numeric(x$Genre)))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4
##          1 48  0  2  0
##          2  0 28 13  0
##          3  0 21 35  0
##          4  1  1  0 47
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8061         
##                  95% CI : (0.7437, 0.859)
##     No Information Rate : 0.2551         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7415         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity            0.9796   0.5600   0.7000   1.0000
## Specificity            0.9864   0.9110   0.8562   0.9866
## Pos Pred Value         0.9600   0.6829   0.6250   0.9592
## Neg Pred Value         0.9932   0.8581   0.8929   1.0000
## Prevalence             0.2500   0.2551   0.2551   0.2398
## Detection Rate         0.2449   0.1429   0.1786   0.2398
## Detection Prevalence   0.2551   0.2092   0.2857   0.2500
## Balanced Accuracy      0.9830   0.7355   0.7781   0.9933
tab_kmedoids <- table(cl2$clustering, x$Genre)
# xtable((round(prop.table(tab_kmedoids, margin = 2), digits = 2))*100)
# FIX: the former `tab_kmedoids/colSums(tab_kmedoids)` recycled the column
# sums down the rows, dividing row i by colSums[i] rather than normalising
# each genre column; prop.table(, margin = 2) computes the intended
# per-genre proportions.
round(prop.table(tab_kmedoids, margin = 2), digits = 2) * 100
# Factor analysis + k-means on scores

library("psych")
# Kaiser-Meyer-Olkin sampling adequacy of the scaled audio features.
KMO(z)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = z)
## Overall MSA =  0.83
## MSA for each item = 
##     acousticness     danceability           energy instrumentalness 
##             0.83             0.92             0.80             0.67 
##         liveness         loudness      speechiness            tempo 
##             0.81             0.87             0.84             0.82 
##          valence 
##             0.80
# Scree plot plus Horn's parallel analysis (95th centile); both point to a
# two-factor solution.  TRUE spelled out instead of the shorthand T.
scree(z)

library("paran")
paran(z, centile=95, all=TRUE, graph=TRUE) # two factors are appropriate
## 
## Using eigendecomposition of correlation matrix.
## Computing: 10%  20%  30%  40%  50%  60%  70%  80%  90%  100%
## 
## 
## Results of Horn's Parallel Analysis for component retention
## 270 iterations, using the 95 centile estimate
## 
## -------------------------------------------------- 
## Component   Adjusted    Unadjusted    Estimated 
##             Eigenvalue  Eigenvalue    Bias 
## -------------------------------------------------- 
## 1           3.846102    4.305594      0.459491
## 2           1.166438    1.462154      0.295715
## 3           0.791006    0.979100      0.188093
## 4           0.708123    0.813018      0.104895
## 5           0.657852    0.694514      0.036661
## 6           0.328989    0.301951     -0.02703
## 7           0.317386    0.222367     -0.09501
## 8           0.304598    0.140507     -0.16409
## 9           0.313412    0.080790     -0.23262
## -------------------------------------------------- 
## 
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (2 components retained)

# Two-factor varimax solution on the audio features.
# FIX: the result was originally assigned to `fa`, masking psych::fa()
# itself; renamed so any later fa() calls keep resolving to the function.
fa_varimax <- fa(z, nfactors=2, rotate="varimax")
library("plot.matrix")
## Registered S3 method overwritten by 'plot.matrix':
##   method        from
##   plot.loadings pls
plot(loadings(fa_varimax), las=1)

# Reliability of the acousticness/danceability/energy/loudness bundle;
# check.keys = TRUE reverses negatively correlated items automatically.
al1 <- psych::alpha(z[,c("acousticness", "danceability", "energy", "loudness")], check.keys = TRUE)
## Warning in psych::alpha(z[, c("acousticness", "danceability", "energy", : Some items were negatively correlated with total scale and were automatically reversed.
##  This is indicated by a negative sign for the variable name.
al1
## 
## Reliability analysis   
## Call: psych::alpha(x = z[, c("acousticness", "danceability", "energy", 
##     "loudness")], check.keys = TRUE)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean   sd median_r
##       0.95      0.95    0.94      0.82  18 0.0062 -0.6 0.93     0.81
## 
##  lower alpha upper     95% confidence boundaries
## 0.94 0.95 0.96 
## 
##  Reliability if an item is dropped:
##               raw_alpha std.alpha G6(smc) average_r S/N alpha se   var.r med.r
## acousticness-      0.93      0.93    0.90      0.81  13   0.0092 0.00225  0.80
## danceability       0.94      0.94    0.93      0.85  16   0.0073 0.00298  0.86
## energy             0.92      0.92    0.88      0.79  11   0.0102 0.00058  0.79
## loudness           0.94      0.94    0.92      0.84  15   0.0078 0.00246  0.81
## 
##  Item statistics 
##                 n raw.r std.r r.cor r.drop     mean sd
## acousticness- 196  0.94  0.94  0.92   0.89 -2.4e+00  1
## danceability  196  0.91  0.91  0.86   0.84 -5.5e-18  1
## energy        196  0.96  0.96  0.95   0.92  1.6e-16  1
## loudness      196  0.92  0.92  0.88   0.85  6.0e-17  1
# Reliability of the two-item instrumentalness/valence bundle.
al2 <- psych::alpha(z[,c("instrumentalness", "valence")], check.keys = TRUE)
## Warning in psych::alpha(z[, c("instrumentalness", "valence")], check.keys = TRUE): Some items were negatively correlated with total scale and were automatically reversed.
##  This is indicated by a negative sign for the variable name.
## Warning in matrix(unlist(drop.item), ncol = 10, byrow = TRUE): Datenlänge [16]
## ist kein Teiler oder Vielfaches der Anzahl der Spalten [10]
al2
## 
## Reliability analysis   
## Call: psych::alpha(x = z[, c("instrumentalness", "valence")], check.keys = TRUE)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N  ase mean   sd median_r
##       0.79      0.79    0.65      0.65 3.7 0.03 0.62 0.91     0.65
## 
##  lower alpha upper     95% confidence boundaries
## 0.73 0.79 0.85 
## 
##  Reliability if an item is dropped:
##                   raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r
## instrumentalness-      0.65      0.65    0.42      0.65  NA       NA  0.65
## valence                0.42      0.65      NA        NA  NA       NA  0.42
##                   med.r
## instrumentalness-  0.65
## valence            0.65
## 
##  Item statistics 
##                     n raw.r std.r r.cor r.drop    mean sd
## instrumentalness- 196  0.91  0.91  0.73   0.65 1.2e+00  1
## valence           196  0.91  0.91  0.73   0.65 1.4e-17  1
# Combine the two reliability-scale scores into one matrix for clustering.
# NOTE(review): this assignment masks base::scale(), which this script used
# earlier (z <- scale(x_features)); it only works because scale() is not
# called as a function again below — consider renaming, e.g. scale_scores.
scale <- cbind(al1$scores, al2$scores)

fviz_nbclust(scale, kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2)+
  labs(subtitle = "Elbow method")

# Rearrange the rows and columns of a matrix-like object so that similar
# rows/columns (by hierarchical clustering of their Euclidean distances)
# end up adjacent — used to make cross-tables near-diagonal.
# Extra arguments are accepted for call compatibility but unused.
# NOTE(review): this masks the stats::reorder generic; consider renaming.
reorder <- function(x, ...) {
  row_order <- hclust(dist(x))$order
  col_order <- hclust(dist(t(x)))$order
  x[row_order, col_order]
}

# k-means on the full scaled features vs on the two scale scores; the
# reordered cross-table shows how well the two partitions agree.
set.seed(42)
cl1 <- kmeans(z, 4)
set.seed(42)
cl2 <- kmeans(scale, 4)

print(reorder(table(cl1$cluster,cl2$cluster)))
##    
##      1  3  2  4
##   3 51  0  0  0
##   4  0 49  0  0
##   1  0  0 14 18
##   2  0  0 42 22
par(mfcol=c(1,3))

# Side by side in PC space: feature-space k-means, score-space k-means,
# and the true genres.
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl1$cluster, pch=19)
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$cluster, pch=19)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19)

par(mfcol=c(1,2))
library("e1071")
set.seed(42)
# Fuzzy c-means with 4 clusters; each point is coloured by its membership
# degree in cluster 1 along a black→green→blue→red ramp.
cl1 <- cmeans(z, 4)
mcolor <- colorRamp(c("black", "green", "blue", "red"))
# FIX: `max = 255` only worked via partial matching of rgb()'s
# maxColorValue argument; spelled out in full.
col <- rgb(mcolor(cl1$membership[,1]), maxColorValue = 255)
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col, main = 'Soft-clustering (fuzzy c-means, k = 4)')

# Divisive hierarchical clustering (DIANA), cut into k = 4 groups.
cl  <- diana(z)
hcl <- cutree(as.hclust(cl), k = 4)
par(mfrow=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=hcl, 
     pch=19, cex=0.5)

library("proxy")
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
# Heatmap of the pairwise distance matrix; blocks hint at cluster structure.
d <- as.matrix(dist(z))
heatmap(d)

# Same view after converting distances to similarities.
s <- pr_dist2simil(d) # the darker, the more similar
heatmap(s)

par(mfcol=c(1,1))

# Agglomerative clustering with Ward's criterion, cut at 4 groups and shown
# in PC space.
d <- dist(z)
# hclust
cl1 <- hclust(d, method = 'ward.D2')
memb <- cutree(cl1, 4)
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb)

# Full dendrogram; FALSE spelled out instead of the shorthand F.
plot(cl1, ann = FALSE)
title(xlab = "Ward.D2", ylab = "Height")

library("ggplot2")
library("tibble")

# Scree-style plot of dendrogram merge heights against the number of groups;
# an elbow suggests where to cut.
# FIX: the former as_tibble()-on-a-vector pipeline is deprecated (it used to
# emit "Calling as_tibble() on a vector is discouraged"); build the tibble
# directly instead.  groups counts down from length(height) to 1.
ggplot(
  tibble(
    height = cl1$height,
    groups = rev(seq_along(cl1$height))
  ),
  aes(x = groups, y = height)
) +
  geom_point() +
  geom_line()

library("mclust")
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:MVN':
## 
##     mvn
## The following object is masked from 'package:psych':
## 
##     sim
# Gaussian mixture model; Mclust selects covariance model and number of
# components via BIC.
set.seed(42)
cl <- Mclust(z)
summary(cl)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVI (diagonal, varying volume and shape) model with 9 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -232.2578 196 170 -1361.795 -1368.181
## 
## Clustering table:
##  1  2  3  4  5  6  7  8  9 
##  9 23  7 35  7 65 24  8 18
par(mfrow=c(2,2))
# BIC curves per model, classification in PC space, and the density plot.
plot(cl, "BIC")
plot(pc_cor$x[,1], pc_cor$x[,2], col = cl$classification)
# NOTE(review): plot() draws as a side effect here; the assigned value is
# presumably NULL/invisible — verify before relying on EM_density_plot.
EM_density_plot <- plot(cl, "density")

par(mfcol=c(2,2))

library("fpc")
## 
## Attaching package: 'fpc'
## The following object is masked from 'package:xtable':
## 
##     xtable
# DBSCAN over a grid of eps radii with MinPts = 5; z is already
# standardised, hence scale = FALSE.  Grey marks noise points (cluster 0),
# colours mark clusters.  The four copy-pasted stanzas were folded into one
# loop; each run keeps its own set.seed(42) as before.
for (eps in c(0.5, 1, 1.5, 2)) {
  set.seed(42)
  cl <- dbscan(z, eps, scale = FALSE, MinPts = 5)
  col <- c('grey', rainbow(max(cl$cluster)))
  plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
}

# Mixed clustering


# hclust
d    <- dist(z)
set.seed(42)
cl1  <- hclust(d, method="ward.D2")
memb <- cutree(cl1, 4)
# kmeans, initialised at the per-cluster feature means of the Ward solution
groupm  <- aggregate(z, list(memb), mean)
# All feature columns of the aggregate (everything except the grouping
# column), in z's column order — equivalent to the former hand-written
# cbind() of nine columns, but robust to feature additions.
centers <- as.matrix(groupm[, setdiff(names(groupm), "Group.1")])
set.seed(42)
cl2 <- kmeans(z, centers = centers)
# compare results (confusion matrix)
table(memb, cl2$cluster) # result: k-means has classified 4 observations differently than hclust
##     
## memb  1  2  3  4
##    1 51  0  0  1
##    2  0 49  0  0
##    3  0  0  7  0
##    4  0  0  3 85
par(mfcol=c(1,2))
# Mixed-clustering assignment in PC space, with the k-means centers
# projected onto the first two PCs (magenta crosshairs).
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$cluster,
     main="Cluster predictions, mixed clustering", pch=19, xlab = 'PC1', ylab = 'PC2')

# project cluster centers to PCA feature space:

cluster_centers_pca <- cl2$centers %*% pc_cor$rotation[, 1:2]
points(cluster_centers_pca[,1], cluster_centers_pca[,2], col = 'magenta', pch = 10, cex = 3)

# True genres in the same projection.
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre),
     main="Truth", pch=19, xlab = 'PC1', ylab = 'PC2')

library("NbClust")

# Total variance explained
# Between-SS / total-SS for k-means solutions with k = 2..15; tve[1] stays
# NA since k = 1 explains nothing by definition.
tve <- rep(NA_real_, 15)
set.seed(42)  # kmeans uses random starts; seeded for reproducibility,
              # matching the script's set.seed(42) convention elsewhere
for (k in 2:15) {
  clk <- kmeans(z, k)
  tve[k] <- 1 - clk$tot.withinss / clk$totss
}
plot(tve, type="b")

set.seed(42)
# Calinski-Harabasz index across k on the Ward.D2 hierarchy.
NbClust(z, method="ward.D2", index="ch") # result: k*=2
## $All.index
##        2        3        4        5        6        7        8        9 
## 139.7710 120.4259  98.3300  88.7335  82.5706  74.8123  69.3249  65.2113 
##       10       11       12       13       14       15 
##  61.9272  59.6528  58.1524  57.0914  56.0043  55.3023 
## 
## $Best.nc
## Number_clusters     Value_Index 
##           2.000         139.771 
## 
## $Best.partition
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2   1   2   2 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   2   2   2   2   2   2   2   2   2   2   2   2   1   2   2   2   1   2   2   2 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   2   2   2   2   2   2   2   2   2   2   2   1   2   2   2   2   2   2   2 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 141 142 143 145 146 147 148 149 150 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   1   2   1   1   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 163 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 183 184 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1
set.seed(42)
# All NbClust indices with majority vote over the proposed k values.
NbClust(z, method="ward.D2") # result: majority vote (12) for k*=3.

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 12 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 2 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
## $All.index
##        KL       CH Hartigan     CCC     Scott      Marriot    TrCovW    TraceW
## 2  2.9225 139.7710  59.4773  3.0748  463.1206 4.072864e+17 21596.043 1020.0706
## 3  3.5131 120.4259  24.7670  6.5366  826.6680 1.433964e+17 11111.650  780.7158
## 4  0.8659  98.3300  24.3660  7.3358  952.5966 1.340864e+17  8423.499  691.9237
## 5  1.1693  88.7335  21.0237  9.1153 1104.3665 9.658613e+16  6515.744  614.0028
## 6  2.4339  82.5706  12.1013 11.0072 1213.6275 7.964860e+16  4756.855  553.1201
## 7  0.9542  74.8123  11.5500 10.5538 1366.8443 4.961074e+16  4347.806  520.0008
## 8  1.0040  69.3249  10.9477 10.3915 1447.6622 4.290265e+16  3883.982  490.0530
## 9  1.0561  65.2113  10.1985 10.3909 1533.7264 3.500170e+16  3428.487  463.0863
## 10 0.8937  61.9272  10.6113 10.4543 1598.4194 3.106406e+16  2948.095  439.1369
## 11 0.9214  59.6528  11.0369 10.7463 1689.2379 2.364879e+16  2610.542  415.4363
## 12 0.9994  58.1524  10.9827 11.8133 1812.3952 1.501397e+16  2480.136  392.0472
## 13 1.1530  57.0914   9.8993 12.5013 1911.2783 1.063937e+16  2136.940  369.9646
## 14 0.9733  56.0043  10.0901 13.0538 1965.2543 9.368841e+15  1826.549  350.9785
## 15 1.3913  55.3023   7.9632 13.7026 2032.3638 7.636806e+15  1567.078  332.5424
##    Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
## 2   21.7400 1.7205 0.3120 0.9783     0.4265 0.7137  56.9638  2.3921    0.4113
## 3   28.4878 2.2479 0.2692 1.1655     0.3482 0.8329  18.6544  1.1917    0.4042
## 4   29.5447 2.5364 0.3313 1.1514     0.3628 0.8040  20.9664  1.4472    0.3785
## 5   31.1993 2.8583 0.2878 1.4486     0.3458 0.6822  16.3044  2.7196    0.3548
## 6   32.3856 3.1729 0.3217 1.2612     0.3517 0.8201  10.9717  1.2919    0.3361
## 7   37.4071 3.3750 0.3024 1.4710     0.2876 0.7661  14.9600  1.7967    0.3153
## 8   39.0800 3.5812 0.2885 1.5297     0.2588 0.6165   8.0877  3.4690    0.2988
## 9   40.7503 3.7898 0.2840 1.4543     0.2668 0.6327  27.2817  3.4130    0.2849
## 10  42.0453 3.9965 0.2731 1.4046     0.2679 0.6075  12.9223  3.6951    0.2731
## 11  44.1287 4.2245 0.2665 1.3700     0.2704 0.8028   9.5795  1.4381    0.2628
## 12  48.7617 4.4765 0.2489 1.3929     0.2545 0.7169  11.0589  2.2899    0.2538
## 13  52.5702 4.7437 0.2390 1.3446     0.2621 0.6343  12.6834  3.3114    0.2459
## 14  53.4624 5.0003 0.2303 1.2754     0.2686 1.7043  -2.0662 -2.0679    0.2387
## 15  54.9323 5.2775 0.2808 1.2230     0.2738 0.6088   6.4256  3.5078    0.2322
##        Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  510.0353     0.6596  1.5894  0.3791 0.1672 0.0014  1.3874 2.1143 0.6318
## 3  260.2386     0.5824 -0.0150  1.0162 0.1346 0.0014  1.4240 1.7562 0.5419
## 4  172.9809     0.6279  0.6392  1.0868 0.1782 0.0015  1.6964 1.6896 0.6632
## 5  122.8006     0.5917  0.1784  1.6348 0.1280 0.0016  1.9833 1.5926 0.7320
## 6   92.1867     0.5988  0.3878  1.7244 0.1504 0.0017  1.7666 1.5282 0.5752
## 7   74.2858     0.5900  0.9796  1.9096 0.1504 0.0018  1.8074 1.4831 0.5428
## 8   61.2566     0.5491  0.0245  2.3354 0.1264 0.0018  2.0264 1.4334 0.4935
## 9   51.4540     0.5521  0.2609  2.3378 0.1264 0.0018  1.9033 1.4024 0.4559
## 10  43.9137     0.5485  0.1750  2.4413 0.1264 0.0018  1.8289 1.3667 0.4188
## 11  37.7669     0.5484  0.3502  2.4821 0.1264 0.0019  1.7465 1.3323 0.3863
## 12  32.6706     0.5333  0.2567  2.7392 0.1264 0.0020  1.7621 1.2967 0.3768
## 13  28.4588     0.5279  0.1186  2.8500 0.1264 0.0021  1.6812 1.2591 0.3519
## 14  25.0699     0.5285 -0.0290  2.8823 0.1264 0.0021  1.6455 1.2326 0.3438
## 15  22.1695     0.5298  0.1067  2.8742 0.1535 0.0022  1.8933 1.2092 0.2842
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.8094            33.4492       0.0110
## 3          0.7816            25.9821       0.2967
## 4          0.7759            24.8432       0.1638
## 5          0.6927            15.5269       0.0046
## 6          0.7297            18.5198       0.2388
## 7          0.7278            18.3290       0.0668
## 8          0.5577            10.3089       0.0008
## 9          0.7237            17.9442       0.0005
## 10         0.6225            12.1297       0.0003
## 11         0.7045            16.3556       0.1702
## 12         0.6665            14.0075       0.0174
## 13         0.6355            12.6164       0.0009
## 14         0.3854             7.9739       1.0000
## 15         0.5139             9.4601       0.0009
## 
## $Best.nc
##                     KL      CH Hartigan     CCC    Scott    Marriot   TrCovW
## Number_clusters 3.0000   2.000   3.0000 15.0000   3.0000 3.0000e+00     3.00
## Value_Index     3.5131 139.771  34.7103 13.7026 363.5475 2.5458e+17 10484.39
##                   TraceW Friedman  Rubin  Cindex     DB Silhouette   Duda
## Number_clusters   3.0000   3.0000  3.000 14.0000 2.0000     2.0000 3.0000
## Value_Index     150.5628   6.7478 -0.239  0.2303 0.9783     0.4265 0.8329
##                 PseudoT2  Beale Ratkowsky     Ball PtBiserial   Frey McClain
## Number_clusters   3.0000 3.0000    2.0000   3.0000     2.0000 2.0000  2.0000
## Value_Index      18.6544 1.1917    0.4113 249.7967     0.6596 1.5894  0.3791
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 4.0000      0  2.0000      0 15.0000
## Value_Index     0.1782      0  1.3874      0  0.2842
## 
## $Best.partition
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   2   2   2   2   2   2   2   3   3   3   1   3   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   3   3   3   3   3   3   3   3   3   3   3   1   3   3   3   1   3   3   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   3   3   3   3   3   3   3   3   3   3   3   3   1   3   3   3   3   3   3   3 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   3   3   3   3   3   3   3   3   3   2   3   3   3   3   3   3   3   3   3 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 141 142 143 145 146 147 148 149 150 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   1   2   1   1   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 163 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 183 184 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1
# Silhouette method
# Suggested number of clusters via average silhouette width (factoextra);
# `z` is the scaled feature data frame prepared earlier in the file.
set.seed(42)
fviz_nbclust(z, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")

# Hierarchical clustering (Ward linkage on Euclidean distances of the scaled
# data z); compare the 2-, 3- and 4-cluster solutions.
d <- dist(z)
cl1 <- hclust(d, method="ward.D2")
memb2 <- cutree(cl1, 2)
memb3 <- cutree(cl1, 3)
memb4 <- cutree(cl1, 4)

library("cluster")
# 2x2 panel (mfcol fills column-wise): PCA scatter coloured by cluster above
# the corresponding silhouette plot, for k = 2 and k = 3.
# pc_cor is the PCA object fitted earlier in the file.
par(mfcol=c(2,2))
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb2)
s2 <- silhouette(memb2, d)
plot(s2, col=1:2, border=NA)
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb3)
s3 <- silhouette(memb3, d)
plot(s3, col=1:3, border=NA)

# clusplot projects the clusters onto the first two principal components of z.
par(mfcol=c(1,3))
clusplot(z, memb2, col.p=memb2)
clusplot(z, memb3, col.p=memb3)
clusplot(z, memb4, col.p=memb4)

library("cluster")
set.seed(42)
# Partitioning around medoids (k = 4). The name captures below record which
# observations fall into each medoid cluster; the genre interpretation
# (classic / techno / pop / hip hop) is taken from the variable names.
cl2 <- pam(z, 4)
k_classic <- names(cl2$clustering[cl2$clustering == 1])
k_techno <- names(cl2$clustering[cl2$clustering == 2])
k_pop <- names(cl2$clustering[cl2$clustering == 3])
k_hiphop <- names(cl2$clustering[cl2$clustering == 4])

# Relabel the clusters (swap codes 2 and 4) so the numeric codes follow the
# intended order 1=classic, 2=hiphop, 3=pop, 4=techno. Indexing by the name
# vectors saved above makes the sequential overwrites safe.
cl2$clustering[k_techno] <- 4
cl2$clustering[k_pop] <- 3
cl2$clustering[k_hiphop] <- 2
cl2$clustering[k_classic] <- 1

x$cluster <- as.factor(cl2$clustering)
x$Artist_Follower <- x$Artist_Follower/1000000  # rescale to millions of followers

# Short aliases for the factor scores created earlier in the file
# (presumably factor-analysis scores; verify upstream).
x$fa_1 <- x$factor_1
x$fa_2 <- x$factor_2
x$fa_3 <- x$factor_3
x$fa_4 <- x$factor_4


# Simple (single-predictor) regressions of Artist_Popularity on each candidate
# variable; each panel shows the scatter, the fitted line and R^2 in the title.
par(mfrow=c(3,3))
nx <- c("fa_1", "fa_2", "fa_3", "fa_4", "pc_1", "pc_2", "Artist_Follower", "days_release_orig", "Track_Popularity")
for (i in seq_along(nx)) {
  # reformulate() builds `Artist_Popularity ~ <nx[i]>` without string pasting
  lmi <- lm(reformulate(nx[i], response = "Artist_Popularity"), data = x)
  # NOTE: the original had a bare `summary(lmi)` here; inside a for loop the
  # result is discarded without auto-printing, so it was dead code (removed).
  plot(x[,nx[i]], x[,'Artist_Popularity'], xlab=nx[i], ylab='Artist Popularity', main=sprintf("R^2=%.2f", summary(lmi)$r.squared))
  abline(lmi, col="red")
}

# Baseline multiple regression: popularity on the four factor scores, the two
# principal components, follower count (in millions) and days since release.
model1 <- lm(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)

summary(model1)
## 
## Call:
## lm(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + 
##     pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.4051  -6.6722   0.3102   7.0904  21.0544 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       64.5143111  0.8690218  74.238  < 2e-16 ***
## fa_1              -0.0816610  1.1386538  -0.072 0.942904    
## fa_2               1.8480927  0.7571880   2.441 0.015589 *  
## fa_3               5.4094057  1.3919701   3.886 0.000141 ***
## fa_4              -1.2986520  1.2829029  -1.012 0.312714    
## pc_1               1.4023625  0.5912068   2.372 0.018706 *  
## pc_2               7.4542956  0.6570917  11.344  < 2e-16 ***
## Artist_Follower    0.8090864  0.1199206   6.747 1.84e-10 ***
## days_release_orig -0.0004646  0.0002093  -2.220 0.027631 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.765 on 187 degrees of freedom
## Multiple R-squared:  0.6364, Adjusted R-squared:  0.6208 
## F-statistic: 40.91 on 8 and 187 DF,  p-value: < 2.2e-16
# Interpretation labels for the latent variables:
# fa_2: "Youtube"
# fa_3: "Artist content supply"
# fa_4: "Artist long-term activity"
# fa_1: "One hit wonder"

# pc_1 (between classic and Pop, Hip Hop, Techno): strong negative loadings: acousticness and instrumentalness, strong positive loadings: danceability, energy and loudness
# pc_2 (between Techno and Pop, Hip Hop): strong negative loadings: instrumentalness and tempo, strong positive loadings: speechiness and valence

# Variance inflation factors (car package, loaded earlier) — all < 5.
vif(model1) # fa_1, fa_3, fa_4, pc_1 mild collinearity
##              fa_1              fa_2              fa_3              fa_4 
##          2.181823          1.127092          2.881682          2.202112 
##              pc_1              pc_2   Artist_Follower days_release_orig 
##          3.077405          1.290976          1.276809          1.607455
# Collinearity diagnostics via condition indices and variance-decomposition
# proportions (Belsley-Kuh-Welsch); all condition indices < 30 here.
library("perturb") # condition index
colldiag(model1)
## Condition
## Index    Variance Decomposition Proportions
##          intercept fa_1  fa_2  fa_3  fa_4  pc_1  pc_2  Artist_Follower
## 1  1.000 0.002     0.017 0.003 0.030 0.035 0.031 0.000 0.000          
## 2  1.314 0.106     0.011 0.066 0.002 0.000 0.002 0.071 0.129          
## 3  1.462 0.003     0.135 0.054 0.013 0.014 0.008 0.093 0.103          
## 4  1.562 0.203     0.000 0.220 0.006 0.015 0.004 0.141 0.006          
## 5  1.914 0.002     0.036 0.381 0.041 0.049 0.017 0.373 0.025          
## 6  2.154 0.035     0.007 0.265 0.055 0.194 0.002 0.031 0.405          
## 7  2.926 0.635     0.006 0.006 0.000 0.069 0.042 0.099 0.121          
## 8  3.154 0.000     0.412 0.007 0.169 0.323 0.301 0.085 0.204          
## 9  3.912 0.015     0.376 0.000 0.683 0.301 0.593 0.107 0.007          
##   days_release_orig
## 1 0.029            
## 2 0.036            
## 3 0.023            
## 4 0.030            
## 5 0.005            
## 6 0.048            
## 7 0.739            
## 8 0.069            
## 9 0.022
# I remove some variables according to insignificance (fa_1, fa_4)

# Reduced model: drop the insignificant fa_1 and fa_4 from model1.
model2 <- lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
# summary(model2)
vif(model2)
##              fa_2              fa_3              pc_1              pc_2 
##          1.126103          1.674037          2.021619          1.082232 
##   Artist_Follower days_release_orig 
##          1.177174          1.510313
select.cols <- c("Artist_Popularity", "fa_2", "fa_3", "pc_1", "pc_2", "Artist_Follower", "days_release_orig")

# Standardize (z-score) the model-2 variables for standardized coefficients.
# all_of() makes the external character vector an explicit column selection —
# it silences the tidyselect ambiguity message the original call produced and
# errors loudly if a column is missing.
# NOTE(review): this overwrites `z`, the scaled matrix used by the clustering
# section above — rename if that section is ever re-run afterwards.
z <- as.data.frame(scale(dplyr::select(x, all_of(select.cols))))
# Same specification on the standardized data; identical VIFs (scaling does
# not change correlations), but coefficients are now comparable in magnitude.
model2_z <- lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = z)
# summary(model2_z)
vif(model2_z)
##              fa_2              fa_3              pc_1              pc_2 
##          1.126103          1.674037          2.021619          1.082232 
##   Artist_Follower days_release_orig 
##          1.177174          1.510313
# Squared standardized coefficients as a rough variance-decomposition of y
# (valid only under zero multicollinearity, as noted below).
model2_z$coefficients^2
##       (Intercept)              fa_2              fa_3              pc_1 
##      8.251688e-33      1.318360e-02      8.530193e-02      5.212851e-02 
##              pc_2   Artist_Follower days_release_orig 
##      3.226773e-01      1.063958e-01      1.819168e-02
sum(model2_z$coefficients^2) # 60% is the proportion in the variance of y explained by X and 40% of the variance of y is due to the variance of the residuals outside of the model IFF there was zero multicollinearity. In terms of relevance, 32% of variance in y is explained by pc_2, 10% by Artist_Follower, 8% by fa_3
## [1] 0.5978788
sort(round((model2_z$coefficients)^2, digits = 4), decreasing = T)
##              pc_2   Artist_Follower              fa_3              pc_1 
##            0.3227            0.1064            0.0853            0.0521 
## days_release_orig              fa_2       (Intercept) 
##            0.0182            0.0132            0.0000
# Quadratic specification seems appropriate:

# variables without and with quadratic terms (fa_2, pc_2, Artist_Follower)

par(mfcol=c(2,3))

# One column per predictor (mfcol fills column-wise): linear fit in the top
# panel, quadratic fit below. The fitted model objects are still assigned to
# the same global names as before (lm_fa2, lmq_fa2, lm_follower, ...).
quad_pred <- c(fa_2 = "fa2", Artist_Follower = "follower", pc_2 = "pc2")
for (v in names(quad_pred)) {
  y_obs <- x$Artist_Popularity

  # -- linear fit ---------------------------------------------------------
  fit_lin <- lm(reformulate(v, response = "Artist_Popularity"), data = x)
  assign(paste0("lm_", quad_pred[[v]]), fit_lin)
  plot(x[[v]], y_obs, xlab = v,
       main = sprintf("R^2=%.2f", summary(fit_lin)$r.squared),
       ylim = range(fitted(fit_lin), y_obs), ylab = 'Artist_Popularity')
  abline(fit_lin, col = "red")

  # -- quadratic fit (orthogonal polynomial) --------------------------------
  fit_quad <- lm(reformulate(sprintf("poly(%s, 2)", v), response = "Artist_Popularity"), data = x)
  assign(paste0("lmq_", quad_pred[[v]]), fit_quad)
  o <- order(x[[v]])
  plot(x[[v]], y_obs, xlab = paste0(v, " + ", v, "^2"),
       main = sprintf("R^2=%.2f", summary(fit_quad)$r.squared),
       ylim = range(fitted(fit_quad), y_obs), ylab = 'Artist_Popularity')
  lines(x[[v]][o], fitted(fit_quad)[o], col = "red")
}

# Residuals normality

# Kolmogorov-Smirnov D for the residuals against a normal with the residuals'
# own mean/sd, compared to the approximate 5% critical value 1.3581/sqrt(n).
# NOTE(review): estimating mean/sd from the same data makes the plain KS test
# conservative (a Lilliefors-corrected test would be stricter) — confirm this
# is acceptable for the analysis.
ks.test(model1$residuals, "pnorm", mean = mean(model1$residuals), sd=sd(model1$residuals))$statistic
##          D 
## 0.04757442
1.3581 / sqrt(length(model1$residuals))
## [1] 0.09700714
ks.test(model2$residuals, "pnorm", mean = mean(model2$residuals), sd=sd(model2$residuals))$statistic
##          D 
## 0.05823339
1.3581 / sqrt(length(model2$residuals))
## [1] 0.09700714
# Standard lm diagnostic panel for model2.
par(mfcol=c(2,2))
plot(model2)

# Raw residuals against each predictor in the model frame (column 1 is the
# response, hence the loop starts at 2), with a lowess smoother.
par(mfcol=c(2,3))
nx <- names(model2$model)
for (i in 2:length(model2$model)) {
  plot(model2$model[,i], residuals(model2), xlab=nx[i])
  lines(lowess(model2$model[,i], residuals(model2)), col = "red")
  abline(h = 0, col = "black", lty = 2)
}

# Raw residuals against fitted values.
par(mfcol=c(1,1))
plot(model2$fitted.values, residuals(model2))
lines(lowess(model2$fitted.values, residuals(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)

# residuals vs. predictors and fitted values suggests nonlinearity of the regression function
# Standardized residuals against each predictor, with a lowess smoother.
par(mfcol=c(2,3))
nx <- names(model2$model)
for (i in 2:length(model2$model)) {
  plot(model2$model[,i], rstandard(model2), xlab=nx[i])
  lines(lowess(model2$model[,i], rstandard(model2)), col = "red")
  # if the lowess line deviates from 0, the residuals are biased, i.e. their mean is not 0
  abline(h = 0, col = "black", lty = 2)
}

# standardized residuals vs. fitted values

par(mfcol=c(1,1))
plot(model2$fitted.values, rstandard(model2), main = 'Standardized residuals vs. fitted values')
lines(lowess(model2$fitted.values, rstandard(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)

# Breusch-Pagan Test + Durbin-Watson Test / spherical errors (i.e. iid)
# model2 uses unscaled data

# Manual Breusch-Pagan: regress the squared residuals on the regressors;
# the LM test statistic is n * R^2 of this auxiliary regression.
auxiliary_reg <- lm(model2$residuals^2 ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
summary(auxiliary_reg)
## 
## Call:
## lm(formula = model2$residuals^2 ~ fa_2 + fa_3 + pc_1 + pc_2 + 
##     Artist_Follower + days_release_orig, data = x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -209.99  -69.77  -26.75   39.15  712.88 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        92.093388   9.620134   9.573  < 2e-16 ***
## fa_2                0.973183   8.480588   0.115  0.90876    
## fa_3              -29.748317  11.887822  -2.502  0.01318 *  
## pc_1               -8.915701   5.369201  -1.661  0.09847 .  
## pc_2                4.904291   6.741245   0.728  0.46782    
## Artist_Follower     3.868671   1.290221   2.998  0.00308 ** 
## days_release_orig  -0.004154   0.002273  -1.828  0.06918 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109.4 on 189 degrees of freedom
## Multiple R-squared:  0.1067, Adjusted R-squared:  0.0783 
## F-statistic: 3.761 on 6 and 189 DF,  p-value: 0.001467
# R^2: 0.1067
# N = 196, N = nrow(x) 
# test stat: 
summary(auxiliary_reg)$r.square * nrow(x)
## [1] 20.90497
# test stat: 20.90497
p <- ncol(model2$model)-1
qchisq(.95, df = p) # chi-squared critical value; df = number of regressors p, excluding the constant (a White-test variant would instead use p*(p+3)/2 auxiliary terms)
## [1] 12.59159
# p_value = P(z > 12.59159) = 1-pchisq(20.90497, p=6) = 0.001908146 < 0.05 ,  reject H0 of homoskedasticity and conclude there is heteroskedasticity
1-pchisq(20.90497, p)
## [1] 0.001908146
# lmtest::bptest reproduces the manual computation above (same BP = n*R^2).
library("lmtest")
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 20.905, df = 6, p-value = 0.001908
# test statistic n*R^2_aux > X^2_crit = qchisq(.95, df = 5), hence reject H0 and conclude that there's
# heteroskedasticity and constancy of error variance does not hold --> inference invalidated

dwtest(model2)
## 
##  Durbin-Watson test
## 
## data:  model2
## DW = 1.9149, p-value = 0.2323
## alternative hypothesis: true autocorrelation is greater than 0
# Durbin Watson test confirms that the error variance-covariance is diagonal, i.e. var(eps | X) =  \Omega * I
# fail to reject H0: rho = 0 (alternative hypothesis is that there is autocorrelation in the errors)
# outlier detection / leverage

# Hat values with reference lines at 1x/2x/3x the average leverage (p+1)/n,
# next to Cook's distances with the usual 4/n reference line.
par(mfcol=c(1,2))
plot(hatvalues(model2), pch=19, main="Leverage", cex=0.5)
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=(1:3)*(p+1)/n, col=c("black", "darkred", "red"))

plot(cooks.distance(model2), pch=19, main="Cook's distances",
     cex=0.5)
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=4/n, col="red")

# NOTE(review): as.numeric(rownames(...)) yields row *labels*, not positions;
# after the complete.cases() filter earlier in the file, x's row names are not
# sequential, so these values must only ever be compared against row names.
outlier_index_leverage <- as.numeric(rownames(x[hatvalues(model2) > 3*(p+1)/n, ]))
outlier_index_final <- outlier_index_leverage
length(outlier_index_leverage)
## [1] 9
# There are 9 high-leverage observations (hat value > 3(p+1)/n) that might be influential on the regression
x[outlier_index_leverage, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
                            "pc_2")]
# Rule of thumb Cook's distance
# movement of regression coefficients all together if ith observation is excluded
# NOTE(review): this cut-off 3(p+1)/n differs from the 4/n line drawn above —
# confirm which threshold is intended.
outlier_index_cooksd <- as.numeric(rownames(x[cooks.distance(model2) > 3*(p+1)/n, ]))
length(outlier_index_cooksd)
## [1] 2
x[outlier_index_cooksd, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
                            "pc_2")]
# differences

# SDBETA

# DFBETAS for model2: standardized change of each coefficient when one
# observation is left out; rule-of-thumb cut-offs at +/- 2/sqrt(n).
SDBETA <- dfbetas(model2)
n <- nrow(model2$model)
par(mfcol=c(2,3))
plot(SDBETA[, 'fa_2'], main="fa_2", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_3'], main="fa_3", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_1'], main="pc_1", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2'], main="pc_2", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Follower'], main="Artist_Follower", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'days_release_orig'], main="days_release_orig", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")

# SDFITS
# DFFITS: standardized change of the fitted value for each observation when
# left out; strict cut-off |DFFITS| > 1 (red) vs. 2*sqrt(p/n) (dark red).
par(mfcol=c(1,1))
SDFITS <- dffits(model2)
plot(SDFITS, pch=19)
abline(h=c(-1,1), col="red")
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=c(-2,2)*sqrt(p/n), col="darkred")

# As this is a small data set, it is sufficient to analyze |\delta*y_i| > 1, in this case two observations are identified
# of exceeding this limit. They are

outlier_index_sdfits <- as.numeric(names(which(abs(SDFITS) > 1))) 

x[outlier_index_sdfits, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
                          "pc_2")]
# Add quadratic terms for fa_2, Artist_Follower and pc_2 first before excluding outliers!

# Explicit squared-term columns (kept as data columns rather than I(...)
# formula terms so later code can refer to the coefficient names fa_2sq etc.).
x$fa_2sq <- x$fa_2^2
x$pc_2sq <- x$pc_2^2
x$Artist_Followersq <- x$Artist_Follower^2
# model2: lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)


# model2 extended by the three quadratic terms.
model3 <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x)
summary(model3)
## 
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + 
##     pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, 
##     data = x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.3742  -5.6832   0.3771   4.9769  19.6356 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       67.5611323  0.9877844  68.397  < 2e-16 ***
## fa_2               6.0858820  1.7276487   3.523 0.000538 ***
## fa_2sq            -0.5504304  0.1615332  -3.408 0.000803 ***
## fa_3               4.4969138  0.9010981   4.990 1.38e-06 ***
## pc_1               1.8133635  0.4088899   4.435 1.57e-05 ***
## pc_2               5.9511646  0.5374732  11.072  < 2e-16 ***
## pc_2sq            -2.1818759  0.3886134  -5.615 7.08e-08 ***
## Artist_Follower    1.6608731  0.2291803   7.247 1.10e-11 ***
## Artist_Followersq -0.0233799  0.0049095  -4.762 3.84e-06 ***
## days_release_orig -0.0004777  0.0001710  -2.794 0.005745 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.206 on 186 degrees of freedom
## Multiple R-squared:  0.7446, Adjusted R-squared:  0.7322 
## F-statistic: 60.25 on 9 and 186 DF,  p-value: < 2.2e-16
# Elevated VIFs for the raw/squared pairs are expected (uncentered polynomials).
vif(model3)
##              fa_2            fa_2sq              fa_3              pc_1 
##          8.309381          7.584394          1.710158          2.084608 
##              pc_2            pc_2sq   Artist_Follower Artist_Followersq 
##          1.223166          1.286026          6.603869          5.824194 
## days_release_orig 
##          1.519144
# KS normality check of model3 residuals vs. the 5% critical value 1.3581/sqrt(n).
ks.test(model3$residuals, "pnorm", mean = mean(model3$residuals), sd=sd(model3$residuals))$statistic
##         D 
## 0.0367905
1.3581 / sqrt(length(model3$residuals))
## [1] 0.09700714
par(mfcol=c(2,2))
# FIX: this diagnostic panel belongs to the model3 section; the original
# called plot(model2) here — a copy-paste of the model2 diagnostics above.
plot(model3)

# Raw residuals of model3 against each predictor, with a lowess smoother.
par(mfcol=c(3,3))
nx <- names(model3$model)
for (i in 2:length(model3$model)) {
  plot(model3$model[,i], residuals(model3), xlab=nx[i])
  lines(lowess(model3$model[,i], residuals(model3)), col = "red")
  abline(h = 0, col = "black", lty = 2)
}

# Raw residuals against fitted values.
par(mfcol=c(1,1))
plot(model3$fitted.values, residuals(model3))
lines(lowess(model3$fitted.values, residuals(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)

# residuals vs. predictors and fitted values suggests accounting for nonlinearity of the regression function has improved

# Standardized residuals of model3 against each predictor.
par(mfcol=c(3,3))
nx <- names(model3$model)
for (i in 2:length(model3$model)) {
  plot(model3$model[,i], rstandard(model3), xlab=nx[i])
  lines(lowess(model3$model[,i], rstandard(model3)), col = "red")
  # if the lowess line deviates from 0, the residuals are biased, i.e. their mean is not 0
  abline(h = 0, col = "black", lty = 2)
}

# standardized residuals vs. fitted values

par(mfcol=c(1,1))
plot(model3$fitted.values, rstandard(model3), main = 'Standardized residuals vs. fitted values')
lines(lowess(model3$fitted.values, rstandard(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)

# Breusch-Pagan Test + Durbin-Watson Test / spherical errors (i.e. iid)
# model2 uses unscaled data

# Manual Breusch-Pagan for model3: squared residuals regressed on all
# regressors; LM statistic = n * R^2 of this auxiliary regression.
auxiliary_reg <- lm(model3$residuals^2 ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x)
summary(auxiliary_reg)
## 
## Call:
## lm(formula = model3$residuals^2 ~ fa_2 + fa_2sq + fa_3 + pc_1 + 
##     pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, 
##     data = x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98.32 -52.32 -23.00  18.34 719.43 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        78.727326  11.223118   7.015 4.15e-11 ***
## fa_2                2.535328  19.629390   0.129   0.8974    
## fa_2sq             -0.474656   1.835326  -0.259   0.7962    
## fa_3              -24.009503  10.238196  -2.345   0.0201 *  
## pc_1               -6.105726   4.645771  -1.314   0.1904    
## pc_2               -9.200191   6.106722  -1.507   0.1336    
## pc_2sq             -4.239332   4.415390  -0.960   0.3382    
## Artist_Follower    -1.901275   2.603926  -0.730   0.4662    
## Artist_Followersq   0.021242   0.055781   0.381   0.7038    
## days_release_orig  -0.002471   0.001942  -1.272   0.2050    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 93.24 on 186 degrees of freedom
## Multiple R-squared:  0.06758,    Adjusted R-squared:  0.02246 
## F-statistic: 1.498 on 9 and 186 DF,  p-value: 0.1513
# R^2: 0.06758
# N = 196, N = nrow(x) 
# test stat: 
summary(auxiliary_reg)$r.square * nrow(x)
## [1] 13.24499
# test stat: 13.24499
p <- ncol(model3$model)-1
qchisq(.95, df = p) # chi-squared critical value; df = number of regressors p, excluding the constant (a White-test variant would instead use p*(p+3)/2 auxiliary terms)
## [1] 16.91898
# p_value = P(z > 16.91898) = 1-pchisq(13.24499, p=9) = 0.1518304 > 0.05 ,  fail to reject H0 of homoskedasticity, i.e. constant variance of error term
1-pchisq(13.24499, p)
## [1] 0.1518304
# lmtest already loaded above; bptest reproduces the manual BP statistic.
library("lmtest")
bptest(model3)
## 
##  studentized Breusch-Pagan test
## 
## data:  model3
## BP = 13.245, df = 9, p-value = 0.1518
# test statistic n*R^2_aux > X^2_crit = qchisq(.95, df = 5), hence fail to reject H0 and conclude that there's
# no heteroskedasticity --> valid inference

dwtest(model3)
## 
##  Durbin-Watson test
## 
## data:  model3
## DW = 2.0528, p-value = 0.5856
## alternative hypothesis: true autocorrelation is greater than 0
# Durbin Watson test confirms that the error variance-covariance is diagonal, i.e. var(eps | X) =  \Omega * I
# fail to reject H0: rho = 0 (alternative hypothesis is that there is autocorrelation in the errors)
# outlier detection / leverage

# NOTE(review): par(mfcol=c(1,1)) here — the model2 version above used
# c(1,2) — so the Cook's-distance plot replaces the leverage plot rather than
# sitting beside it; possibly a typo, confirm the intended layout.
par(mfcol=c(1,1))
plot(hatvalues(model3), pch=19, cex=0.5)
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=(1:3)*(p+1)/n, col=c("black", "darkred", "red"))

plot(cooks.distance(model3), pch=19,
     cex=0.5)
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=4/n, col="red")

# NOTE(review): as.numeric(rownames(...)) yields row *labels*, not positions;
# x's row names are not sequential after the complete.cases() filter, so these
# values must only ever be matched against row names.
outlier_index_leverage <- as.numeric(rownames(x[hatvalues(model3) > 3*(p+1)/n, ]))
outlier_index_final <- outlier_index_leverage
length(outlier_index_leverage)
## [1] 10
# There are 10 outliers that might be influential on the regression
x[outlier_index_leverage, c("Track_Title","Track_Artist", "Genre", "Artist_Popularity", "Artist_Follower", "viewsCount")]
# Rule of thumb Cook's distance
# movement of regression coefficients all together if ith observation is excluded
outlier_index_cooksd <- as.numeric(rownames(x[cooks.distance(model3) > 3*(p+1)/n, ]))
length(outlier_index_cooksd)
## [1] 2
x[outlier_index_cooksd, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
                            "pc_2")]
# same outliers detected as in model2 even after modification of the model
# differences

# SDBETA

# DFBETAS for model3: one panel per coefficient with the +/- 2/sqrt(n)
# rule-of-thumb cut-offs. Refactored from nine copy-pasted plot calls into a
# loop; this also fixes the last panel's y-axis label, which was
# inconsistently 'days_release' instead of 'days_release_orig'.
SDBETA <- dfbetas(model3)
n <- nrow(model3$model)
par(mfcol=c(3,3))
for (coef_name in c("fa_2", "fa_2sq", "fa_3", "pc_1", "pc_2", "pc_2sq",
                    "Artist_Follower", "Artist_Followersq", "days_release_orig")) {
  plot(SDBETA[, coef_name], main = coef_name, pch = 19, ylab = coef_name)
  abline(h = c(-2, 2) / sqrt(n), col = "red")
}

# SDFITS
# DFFITS for model3: strict |DFFITS| > 1 cut-off (red) vs. 2*sqrt(p/n) (dark red).
par(mfcol=c(1,1))
SDFITS <- dffits(model3)
plot(SDFITS, pch=19)
abline(h=c(-1,1), col="red")
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=c(-2,2)*sqrt(p/n), col="darkred")

# As this is a small data set, it is sufficient to analyze |\delta*y_i| > 1, in this case two observations are identified
# of exceeding this limit. They are

outlier_index_sdfits <- as.numeric(names(which(abs(SDFITS) > 1))) 

x[outlier_index_sdfits, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
                          "pc_2")]
# Now exclude the outliers from leverage analysis

# BUG FIX: outlier_index_final holds row *names* coerced to numeric, not row
# positions. After the complete.cases() filter earlier in the file, x's row
# names are no longer 1..nrow(x), so negative positional indexing
# (x[-outlier_index_final, ]) could drop the wrong rows, silently ignore
# out-of-range indices, or — if the index were empty — drop every row
# (x[-integer(0), ] selects zero rows). Match on row names instead.
x_noout <- x[!(rownames(x) %in% as.character(outlier_index_final)), ]

# Final model: model3 specification refit without the high-leverage rows.
lm_final <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x_noout)
summary(lm_final) #  result: R^2 decreased a little bit
## 
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + 
##     pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, 
##     data = x_noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.4824  -5.2041  -0.3971   4.9615  19.4089 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       67.5778523  1.1556264  58.477  < 2e-16 ***
## fa_2              14.0294272  3.7773779   3.714 0.000274 ***
## fa_2sq            -7.7026797  2.3813018  -3.235 0.001455 ** 
## fa_3               4.1812469  1.1144664   3.752 0.000238 ***
## pc_1               1.5664575  0.4198679   3.731 0.000257 ***
## pc_2               5.2986744  0.5665857   9.352  < 2e-16 ***
## pc_2sq            -1.9828773  0.4235979  -4.681 5.68e-06 ***
## Artist_Follower    4.0311406  0.6231803   6.469 9.45e-10 ***
## Artist_Followersq -0.1626515  0.0347541  -4.680 5.70e-06 ***
## days_release_orig -0.0004396  0.0001751  -2.510 0.012974 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.855 on 176 degrees of freedom
## Multiple R-squared:  0.7448, Adjusted R-squared:  0.7317 
## F-statistic: 57.06 on 9 and 176 DF,  p-value: < 2.2e-16
vif(lm_final)
##              fa_2            fa_2sq              fa_3              pc_1 
##          3.847406          3.207554          1.750377          2.249691 
##              pc_2            pc_2sq   Artist_Follower Artist_Followersq 
##          1.401325          1.436766         13.792858         12.676491 
## days_release_orig 
##          1.497164
# NOTE(review): Artist_Follower / Artist_Followersq VIFs (~13.8 / ~12.7) are
# high — expected for an uncentered raw + squared pair; centering would help.
# library("stargazer")
# stargazer::stargazer(model1, model2, model3, lm_final)

library("boot")
## 
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:psych':
## 
##     logit
# Parametric 95% confidence intervals of the final model's coefficients,
# used as the blue reference lines in the bootstrap histograms below.
ci <- confint(lm_final, level = 0.95)

# Histogram of the bootstrap replicates of one coefficient, annotated with:
# the point estimate (red), the parametric confint() limits (blue) and the
# bootstrap 2.5%/97.5% percentile interval (green).
#   b      - point estimate (scalar)
#   b.boot - vector of bootstrap replicates for this coefficient
#   ci     - length-2 vector of parametric confidence limits
#   main   - panel title
# NOTE(review): the dotted name looks like an S3 `plot` method for class
# "confint"; it is only ever called directly here.
plot.confint <- function (b, b.boot, ci, main) {
  hist(b.boot, main = main)
  rug(b.boot)
  abline(v = b, col = "red")
  abline(v = ci, col = "blue")
  boot_ci <- quantile(b.boot, probs = c(0.025, 0.975))
  abline(v = boot_ci, col = "green")
}

# Case-resampling bootstrap of the final model's coefficients (R = 999).
# BUG FIX: the original statistic ignored its data argument and refit on the
# global x_noout with subset=index; that was only accidentally correct because
# boot() was handed lm_final$model, which has exactly the same rows. The
# statistic now uses the data it is given, so it is self-contained.
# NOTE(review): no set.seed() immediately before this call — results are not
# reproducible across runs; confirm whether that matters here.
betahat <- boot(lm_final$model, R=999,
                statistic = function(d, index) {
                  lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
                     data = d[index, ])$coefficients
                })

# One bootstrap-distribution histogram per coefficient via plot.confint:
# red = point estimate, blue = confint() limits, green = bootstrap percentiles.
par(mfcol=c(2,3))

plot.confint(lm_final$coefficients[1], betahat$t[,1], ci[1,], "Intercept")
plot.confint(lm_final$coefficients[2], betahat$t[,2], ci[2,], "fa_2")
plot.confint(lm_final$coefficients[3], betahat$t[,3], ci[3,], "fa_2sq")
plot.confint(lm_final$coefficients[4], betahat$t[,4], ci[4,], "fa_3")
plot.confint(lm_final$coefficients[5], betahat$t[,5], ci[5,], "pc_1")
plot.confint(lm_final$coefficients[6], betahat$t[,6], ci[6,], "pc_2")

par(mfcol=c(2,2))

plot.confint(lm_final$coefficients[7], betahat$t[,7], ci[7,], "pc_2sq")
plot.confint(lm_final$coefficients[8], betahat$t[,8], ci[8,], "Artist_Follower")
plot.confint(lm_final$coefficients[9], betahat$t[,9], ci[9,], "Artist_Followersq")
# NOTE(review): the last panel is titled "days_release" but shows the
# days_release_orig coefficient.
plot.confint(lm_final$coefficients[10], betahat$t[,10], ci[10,], "days_release")

# Minimal quadratic model in fa_2 only, used to test whether the parabola's
# turning point (vertex) lies within the observed data range.
model5 <- lm(Artist_Popularity ~ fa_2 + fa_2sq, data = x)
summary(model5)
## 
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq, data = x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.506 -10.505   1.289  10.461  36.224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  66.5658     1.0458  63.649  < 2e-16 ***
## fa_2         18.0348     2.7106   6.654 2.88e-10 ***
## fa_2sq       -1.3864     0.2653  -5.226 4.47e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.2 on 193 degrees of freedom
## Multiple R-squared:  0.2059, Adjusted R-squared:  0.1977 
## F-statistic: 25.03 on 2 and 193 DF,  p-value: 2.167e-10
# Turning point of the parabola: -b1 / (2*b2).
vertex <- -model5$coefficients[2] / (2*model5$coefficients[3])
vertex
##     fa_2 
## 6.504143
(vertex <= max(x$fa_2)) & (vertex >= min(x$fa_2)) # so the vertex is inside the range of fa_2 and there exists a quadratic component
## fa_2 
## TRUE
# https://stats.idre.ucla.edu/r/faq/how-can-i-estimate-the-standard-error-of-transformed-regression-parameters-in-r-using-the-delta-method/
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1376936-testing-whether-to-include-a-squared-term
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1344089-how-to-find-the-turning-point-for-a-quadratic-function
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1408413-significance-level-of-quadratic-term
# https://psych.unl.edu/psycrs/statpage/nonlin_eg.pdf ("true" quadratic component vs. skewed predictor)


library("msm") # equivalent to the nlcom function in Stata
## 
## Attaching package: 'msm'
## The following object is masked from 'package:boot':
## 
##     cav
# z-value according to the Delta method

# Delta-method standard error of -x2/(2*x3), where x2/x3 are the 2nd/3rd
# coefficients of model5 (fa_2 and fa_2sq).
vertex / msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5)) # = 12.8421
##    fa_2 
## 12.8421
# CI according to Stata
# qt with df = 1e7 is numerically the standard-normal 97.5% quantile (1.959964).
vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5))
##     fa_2 
## 5.511479
vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5))
##     fa_2 
## 7.496808
n <- nrow(model5$model)
d <- length(model5$coefficients)
df <- n - d

# http://sia.webpopix.org/nonlinearRegression.html

# p_value = P(z > 1.959964) = 1-pt(z, df=df) = 3.270659e-10 < 0.05 <-- reject H0 that vertex is equal to zero

# vertex for fa_2

# Turning point of the fa_2 quadratic within the full model3: -b1/(2*b2),
# where x2/x3 in the delta-method formula are the fa_2 / fa_2sq coefficients.
vertex <- -model3$coefficients[2] / (2*model3$coefficients[3])
vertex
##     fa_2 
## 5.528294
(vertex <= max(x$fa_2)) & (vertex >= min(x$fa_2)) 
## fa_2 
## TRUE
# NOTE(review): this overwrites `z`, previously the scaled data frame.
z <- vertex / msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
z
##     fa_2 
## 8.814632
# qt with df = 1e7 is numerically the standard-normal 97.5% quantile.
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
##         fa_2 
## 4.440892e-16
# p_value = P(z > 1.959964) = 1-pt(z, df=df) = 4.440892e-16 < 0.05 <-- reject H0 that vertex is equal to zero

# vertex for pc_2

# Turning point of the pc_2 quadratic: x6/x7 in the delta-method formula are
# the pc_2 / pc_2sq coefficients of model3.
vertex <- -model3$coefficients[6] / (2*model3$coefficients[7])
vertex
##     pc_2 
## 1.363773
(vertex <= max(x$pc_2)) & (vertex >= min(x$pc_2)) 
## pc_2 
## TRUE
z <- vertex / msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
z
##     pc_2 
## 4.536158
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
##         pc_2 
## 5.125243e-06
# p_value = P(z > 1.959964) = 1-pt(z, df=df) = 5.125243e-06 < 0.05 <-- reject H0 that vertex is equal to zero

# vertex for Artist_Follower

# Turning point of the Artist_Follower quadratic: x8/x9 are the
# Artist_Follower / Artist_Followersq coefficients of model3.
# Artist_Follower is measured in millions here (rescaled earlier).
vertex <- -model3$coefficients[8] / (2*model3$coefficients[9])
vertex
## Artist_Follower 
##        35.51927
(vertex <= max(x$Artist_Follower)) & (vertex >= min(x$Artist_Follower)) 
## Artist_Follower 
##            TRUE
z <- vertex / msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
z
## Artist_Follower 
##        9.652667
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
## Artist_Follower 
##               0
#plot(Artist_Follower_trans, x$Artist_Popularity)

#min_max_normalize <- function(x)
#{
#  return( (10-1)*((x- min(x)) /(max(x)-min(x))) + 1)
#}

#min_fa_2 <- optimize(ksD, c(-10,10), x=min_max_normalize(x$pc_2))$minimum
#fa_2bc <- bcPower(min_max_normalize(x$pc_2), min_fa_2)

#fa_2bcsqrr <- fa_2bc ** 0.5
#fa_2bcsqrrcenter <- fa_2bcsqrr - mean(fa_2bcsqrr)
#fa_2bcsqrrcentersq <- fa_2bcsqrrcenter ** 2

#cor(cbind(x$Artist_Popularity, fa_2bcsqrrcenter, fa_2bcsqrrcentersq))





#plot(x$fa_2, x$Artist_Popularity)
#plot(x$pc_2, x$Artist_Popularity)
# baseline model:
# forward

# Forward stepwise selection by AIC, starting from the intercept-only model.
# The scope mixes bare column names (from lm_final$model) with explicit
# x_noout[, '...'] terms; the recorded trace below shows the labels verbatim,
# so the formula is left exactly as written.
lmi <- lm(Artist_Popularity~1, data=lm_final$model)
lmf <- MASS::stepAIC(lmi, scope=~
                fa_2 
               + fa_2sq
               + fa_3 
               + x_noout[,'fa_4'] 
               + pc_1
               + pc_2
               + pc_2sq
               + Artist_Follower
               + Artist_Followersq
               + days_release_orig
               + x_noout[, 'fa_1']
               + x_noout[, 'Track_Duration_ms']
               + x_noout[, 'Track_Popularity']
               ,direction = "forward")
## Start:  AIC=1012.44
## Artist_Popularity ~ 1
## 
##                                  Df Sum of Sq   RSS     AIC
## + pc_2                            1   17048.4 25493  919.20
## + x_noout[, "Track_Duration_ms"]  1   12472.7 30069  949.90
## + Artist_Follower                 1   11518.9 31023  955.71
## + pc_2sq                          1   10312.7 32229  962.81
## + fa_2                            1    8694.3 33848  971.92
## + x_noout[, "Track_Popularity"]   1    7421.3 35120  978.79
## + Artist_Followersq               1    6625.6 35916  982.95
## + pc_1                            1    1481.0 41061 1007.85
## + fa_2sq                          1    1418.9 41123 1008.13
## <none>                                        42542 1012.44
## + days_release_orig               1     414.5 42127 1012.62
## + x_noout[, "fa_4"]               1     398.2 42144 1012.69
## + x_noout[, "fa_1"]               1     176.7 42365 1013.67
## + fa_3                            1     133.5 42408 1013.86
## 
## Step:  AIC=919.2
## Artist_Popularity ~ pc_2
## 
##                                  Df Sum of Sq   RSS    AIC
## + Artist_Follower                 1    7669.0 17824 854.64
## + Artist_Followersq               1    4882.1 20611 881.66
## + x_noout[, "Track_Duration_ms"]  1    4480.6 21013 885.25
## + fa_2                            1    4209.3 21284 887.63
## + x_noout[, "Track_Popularity"]   1    3579.8 21914 893.06
## + pc_2sq                          1    2528.4 22965 901.77
## + days_release_orig               1    2251.5 23242 904.00
## + pc_1                            1    1731.7 23762 908.12
## + x_noout[, "fa_4"]               1    1088.3 24405 913.09
## + fa_2sq                          1     856.0 24637 914.85
## + x_noout[, "fa_1"]               1     736.0 24757 915.75
## <none>                                        25493 919.20
## + fa_3                            1      25.1 25468 921.02
## 
## Step:  AIC=854.64
## Artist_Popularity ~ pc_2 + Artist_Follower
## 
##                                  Df Sum of Sq   RSS    AIC
## + x_noout[, "Track_Popularity"]   1    3198.4 14626 819.85
## + x_noout[, "Track_Duration_ms"]  1    2853.7 14971 824.19
## + Artist_Followersq               1    2136.9 15688 832.89
## + pc_2sq                          1    1360.1 16464 841.88
## + days_release_orig               1    1225.0 16599 843.40
## + fa_2                            1    1157.3 16667 844.15
## + x_noout[, "fa_4"]               1    1050.6 16774 845.34
## + pc_1                            1     843.0 16981 847.63
## <none>                                        17824 854.64
## + fa_2sq                          1      96.1 17728 855.63
## + fa_3                            1      71.3 17753 855.89
## + x_noout[, "fa_1"]               1      19.1 17805 856.44
## 
## Step:  AIC=819.85
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"]
## 
##                                  Df Sum of Sq   RSS    AIC
## + x_noout[, "Track_Duration_ms"]  1   1841.18 12785 796.83
## + Artist_Followersq               1   1806.26 12820 797.34
## + pc_2sq                          1   1150.74 13475 806.61
## + days_release_orig               1    501.06 14125 815.37
## + pc_1                            1    417.59 14208 816.47
## + x_noout[, "fa_4"]               1    282.55 14343 818.23
## + fa_3                            1    198.73 14427 819.31
## <none>                                        14626 819.85
## + fa_2                            1    142.21 14484 820.04
## + x_noout[, "fa_1"]               1     23.05 14603 821.56
## + fa_2sq                          1      7.91 14618 821.75
## 
## Step:  AIC=796.83
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"]
## 
##                     Df Sum of Sq   RSS    AIC
## + Artist_Followersq  1   1508.10 11277 775.48
## + pc_2sq             1   1241.24 11544 779.83
## + fa_3               1    664.00 12121 788.91
## + x_noout[, "fa_1"]  1    147.88 12637 796.67
## <none>                           12785 796.83
## + fa_2               1     83.00 12702 797.62
## + days_release_orig  1     71.00 12714 797.79
## + pc_1               1      6.16 12779 798.74
## + x_noout[, "fa_4"]  1      5.55 12779 798.75
## + fa_2sq             1      4.04 12781 798.77
## 
## Step:  AIC=775.48
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq
## 
##                     Df Sum of Sq   RSS    AIC
## + pc_2sq             1   1076.83 10200 758.82
## + fa_3               1    598.01 10679 767.35
## + fa_2sq             1    185.08 11092 774.40
## <none>                           11277 775.48
## + x_noout[, "fa_1"]  1     84.76 11192 776.08
## + days_release_orig  1     36.02 11241 776.89
## + x_noout[, "fa_4"]  1      5.24 11272 777.40
## + pc_1               1      3.80 11273 777.42
## + fa_2               1      0.27 11276 777.48
## 
## Step:  AIC=758.82
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq
## 
##                     Df Sum of Sq     RSS    AIC
## + fa_3               1   250.781  9949.1 756.19
## + fa_2sq             1   177.173 10022.7 757.56
## + days_release_orig  1   167.813 10032.1 757.73
## <none>                           10199.9 758.82
## + pc_1               1    98.887 10101.0 759.00
## + x_noout[, "fa_1"]  1    81.693 10118.2 759.32
## + x_noout[, "fa_4"]  1    28.307 10171.6 760.30
## + fa_2               1     0.969 10198.9 760.80
## 
## Step:  AIC=756.19
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq + 
##     fa_3
## 
##                     Df Sum of Sq    RSS    AIC
## + pc_1               1    546.47 9402.6 747.68
## + days_release_orig  1    302.11 9647.0 752.45
## + x_noout[, "fa_4"]  1    190.84 9758.3 754.58
## + fa_2sq             1    159.29 9789.8 755.18
## <none>                           9949.1 756.19
## + x_noout[, "fa_1"]  1      1.79 9947.3 758.15
## + fa_2               1      0.74 9948.4 758.17
## 
## Step:  AIC=747.68
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq + 
##     fa_3 + pc_1
## 
##                     Df Sum of Sq    RSS    AIC
## + fa_2sq             1   156.732 9245.9 746.55
## + days_release_orig  1   117.363 9285.3 747.34
## <none>                           9402.6 747.68
## + x_noout[, "fa_1"]  1    25.394 9377.2 749.17
## + x_noout[, "fa_4"]  1     4.049 9398.6 749.60
## + fa_2               1     2.462 9400.2 749.63
## 
## Step:  AIC=746.55
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq + 
##     fa_3 + pc_1 + fa_2sq
## 
##                     Df Sum of Sq    RSS    AIC
## + fa_2               1   191.308 9054.6 744.66
## + days_release_orig  1   105.159 9140.7 746.42
## <none>                           9245.9 746.55
## + x_noout[, "fa_1"]  1    37.651 9208.2 747.79
## + x_noout[, "fa_4"]  1     0.048 9245.8 748.55
## 
## Step:  AIC=744.66
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq + 
##     fa_3 + pc_1 + fa_2sq + fa_2
## 
##                     Df Sum of Sq    RSS    AIC
## + days_release_orig  1   109.025 8945.6 744.41
## <none>                           9054.6 744.66
## + x_noout[, "fa_1"]  1    36.249 9018.3 745.92
## + x_noout[, "fa_4"]  1     0.008 9054.6 746.66
## 
## Step:  AIC=744.41
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] + 
##     x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq + 
##     fa_3 + pc_1 + fa_2sq + fa_2 + days_release_orig
## 
##                     Df Sum of Sq    RSS    AIC
## <none>                           8945.6 744.41
## + x_noout[, "fa_1"]  1    49.184 8896.4 745.38
## + x_noout[, "fa_4"]  1     6.072 8939.5 746.28
summary(lmf)  # forward-selected model; coefficient table recorded below
## 
## Call:
## lm(formula = Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, 
##     "Track_Popularity"] + x_noout[, "Track_Duration_ms"] + Artist_Followersq + 
##     pc_2sq + fa_3 + pc_1 + fa_2sq + fa_2 + days_release_orig, 
##     data = lm_final$model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.0380  -5.2954   0.3417   4.5413  17.3011 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     5.987e+01  3.045e+00  19.664  < 2e-16 ***
## pc_2                            4.357e+00  5.531e-01   7.877 3.48e-13 ***
## Artist_Follower                 4.143e+00  5.731e-01   7.229 1.48e-11 ***
## x_noout[, "Track_Popularity"]   1.936e-01  4.086e-02   4.737 4.48e-06 ***
## x_noout[, "Track_Duration_ms"] -1.528e-05  4.591e-06  -3.329 0.001065 ** 
## Artist_Followersq              -1.614e-01  3.182e-02  -5.071 1.01e-06 ***
## pc_2sq                         -1.667e+00  3.918e-01  -4.255 3.41e-05 ***
## fa_3                            3.769e+00  1.024e+00   3.682 0.000308 ***
## pc_1                            9.810e-01  4.271e-01   2.297 0.022806 *  
## fa_2sq                         -5.659e+00  2.205e+00  -2.566 0.011127 *  
## fa_2                            7.210e+00  3.700e+00   1.948 0.052974 .  
## days_release_orig              -2.378e-04  1.633e-04  -1.456 0.147127    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.17 on 174 degrees of freedom
## Multiple R-squared:  0.7897, Adjusted R-squared:  0.7764 
## F-statistic: 59.41 on 11 and 174 DF,  p-value: < 2.2e-16
# backward:
# Full model with all candidate terms, then backward elimination by AIC.
# Fitted directly on x_noout, so term labels are plain column names here
# (unlike the forward fit above).
lma <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + pc_2 + pc_2sq + Artist_Follower 
          + Artist_Followersq + fa_1 + Track_Duration_ms
          + days_release_orig + Track_Popularity, 
          data= x_noout)

lmb <- MASS::stepAIC(lma)
## Start:  AIC=747.24
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + pc_2 + 
##     pc_2sq + Artist_Follower + Artist_Followersq + fa_1 + Track_Duration_ms + 
##     days_release_orig + Track_Popularity
## 
##                     Df Sum of Sq     RSS    AIC
## - fa_4               1      6.65  8896.4 745.38
## - fa_1               1     49.76  8939.5 746.28
## <none>                            8889.7 747.24
## - days_release_orig  1    128.61  9018.3 747.92
## - fa_2               1    195.49  9085.2 749.29
## - pc_1               1    271.47  9161.2 750.84
## - fa_2sq             1    355.61  9245.3 752.54
## - Track_Duration_ms  1    536.66  9426.4 756.15
## - fa_3               1    615.81  9505.5 757.70
## - pc_2sq             1    830.66  9720.4 761.86
## - Track_Popularity   1   1166.70 10056.4 768.18
## - Artist_Followersq  1   1363.69 10253.4 771.79
## - pc_2               1   2572.29 11462.0 792.51
## - Artist_Follower    1   2719.78 11609.5 794.89
## 
## Step:  AIC=745.38
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + 
##     Artist_Follower + Artist_Followersq + fa_1 + Track_Duration_ms + 
##     days_release_orig + Track_Popularity
## 
##                     Df Sum of Sq     RSS    AIC
## - fa_1               1     49.18  8945.6 744.41
## <none>                            8896.4 745.38
## - days_release_orig  1    121.96  9018.3 745.92
## - fa_2               1    193.77  9090.1 747.39
## - pc_1               1    295.87  9192.2 749.47
## - fa_2sq             1    349.34  9245.7 750.55
## - Track_Duration_ms  1    530.56  9426.9 754.16
## - fa_3               1    632.49  9528.9 756.16
## - pc_2sq             1    828.02  9724.4 759.94
## - Track_Popularity   1   1183.07 10079.4 766.61
## - Artist_Followersq  1   1358.68 10255.0 769.82
## - pc_2               1   2647.56 11543.9 791.84
## - Artist_Follower    1   2723.31 11619.7 793.06
## 
## Step:  AIC=744.41
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + 
##     Artist_Follower + Artist_Followersq + Track_Duration_ms + 
##     days_release_orig + Track_Popularity
## 
##                     Df Sum of Sq     RSS    AIC
## <none>                            8945.6 744.41
## - days_release_orig  1     109.0  9054.6 744.66
## - fa_2               1     195.2  9140.7 746.42
## - pc_1               1     271.3  9216.8 747.97
## - fa_2sq             1     338.5  9284.1 749.32
## - Track_Duration_ms  1     569.7  9515.2 753.89
## - fa_3               1     697.0  9642.6 756.37
## - pc_2sq             1     930.7  9876.3 760.82
## - Track_Popularity   1    1153.8 10099.4 764.97
## - Artist_Followersq  1    1321.8 10267.4 768.04
## - Artist_Follower    1    2686.6 11632.1 791.26
## - pc_2               1    3190.0 12135.5 799.13
summary(lmb)  # backward-selected model; same fit statistics as the forward result lmf
## 
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + 
##     pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + Track_Duration_ms + 
##     days_release_orig + Track_Popularity, data = x_noout)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.0380  -5.2954   0.3417   4.5413  17.3011 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.987e+01  3.045e+00  19.664  < 2e-16 ***
## fa_2               7.210e+00  3.700e+00   1.948 0.052974 .  
## fa_2sq            -5.659e+00  2.205e+00  -2.566 0.011127 *  
## fa_3               3.769e+00  1.024e+00   3.682 0.000308 ***
## pc_1               9.810e-01  4.271e-01   2.297 0.022806 *  
## pc_2               4.357e+00  5.531e-01   7.877 3.48e-13 ***
## pc_2sq            -1.667e+00  3.918e-01  -4.255 3.41e-05 ***
## Artist_Follower    4.143e+00  5.731e-01   7.229 1.48e-11 ***
## Artist_Followersq -1.614e-01  3.182e-02  -5.071 1.01e-06 ***
## Track_Duration_ms -1.528e-05  4.591e-06  -3.329 0.001065 ** 
## days_release_orig -2.378e-04  1.633e-04  -1.456 0.147127    
## Track_Popularity   1.936e-01  4.086e-02   4.737 4.48e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.17 on 174 degrees of freedom
## Multiple R-squared:  0.7897, Adjusted R-squared:  0.7764 
## F-statistic: 59.41 on 11 and 174 DF,  p-value: < 2.2e-16
# Regularization

# Variables kept for the regularization / train-test comparison below.
sel.var <- c("fa_1", "fa_2", "fa_2sq", "fa_3", "fa_4", "pc_1", "pc_2", "pc_2sq", "Artist_Follower", "Artist_Followersq", "Artist_Popularity", "days_release_orig", "Track_Popularity", "Track_Duration_ms") 

# all_of() makes the external character vector an explicit selection and
# silences the tidyselect ambiguity message the bare `sel.var` used to emit.
# NOTE(review): `df` shadows stats::df (and the scalar df computed earlier);
# consider a more distinctive name if this script grows.
df <- dplyr::select(x_noout, dplyr::all_of(sel.var))
# Full OLS on the selected columns, then backward elimination via step() (AIC).
lmfull <- lm(Artist_Popularity~., data=df)
lmback <- step(lmfull)
## Start:  AIC=747.24
## Artist_Popularity ~ fa_1 + fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + 
##     pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig + 
##     Track_Popularity + Track_Duration_ms
## 
##                     Df Sum of Sq     RSS    AIC
## - fa_4               1      6.65  8896.4 745.38
## - fa_1               1     49.76  8939.5 746.28
## <none>                            8889.7 747.24
## - days_release_orig  1    128.61  9018.3 747.92
## - fa_2               1    195.49  9085.2 749.29
## - pc_1               1    271.47  9161.2 750.84
## - fa_2sq             1    355.61  9245.3 752.54
## - Track_Duration_ms  1    536.66  9426.4 756.15
## - fa_3               1    615.81  9505.5 757.70
## - pc_2sq             1    830.66  9720.4 761.86
## - Track_Popularity   1   1166.70 10056.4 768.18
## - Artist_Followersq  1   1363.69 10253.4 771.79
## - pc_2               1   2572.29 11462.0 792.51
## - Artist_Follower    1   2719.78 11609.5 794.89
## 
## Step:  AIC=745.38
## Artist_Popularity ~ fa_1 + fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + 
##     pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig + 
##     Track_Popularity + Track_Duration_ms
## 
##                     Df Sum of Sq     RSS    AIC
## - fa_1               1     49.18  8945.6 744.41
## <none>                            8896.4 745.38
## - days_release_orig  1    121.96  9018.3 745.92
## - fa_2               1    193.77  9090.1 747.39
## - pc_1               1    295.87  9192.2 749.47
## - fa_2sq             1    349.34  9245.7 750.55
## - Track_Duration_ms  1    530.56  9426.9 754.16
## - fa_3               1    632.49  9528.9 756.16
## - pc_2sq             1    828.02  9724.4 759.94
## - Track_Popularity   1   1183.07 10079.4 766.61
## - Artist_Followersq  1   1358.68 10255.0 769.82
## - pc_2               1   2647.56 11543.9 791.84
## - Artist_Follower    1   2723.31 11619.7 793.06
## 
## Step:  AIC=744.41
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + 
##     Artist_Follower + Artist_Followersq + days_release_orig + 
##     Track_Popularity + Track_Duration_ms
## 
##                     Df Sum of Sq     RSS    AIC
## <none>                            8945.6 744.41
## - days_release_orig  1     109.0  9054.6 744.66
## - fa_2               1     195.2  9140.7 746.42
## - pc_1               1     271.3  9216.8 747.97
## - fa_2sq             1     338.5  9284.1 749.32
## - Track_Duration_ms  1     569.7  9515.2 753.89
## - fa_3               1     697.0  9642.6 756.37
## - pc_2sq             1     930.7  9876.3 760.82
## - Track_Popularity   1    1153.8 10099.4 764.97
## - Artist_Followersq  1    1321.8 10267.4 768.04
## - Artist_Follower    1    2686.6 11632.1 791.26
## - pc_2               1    3190.0 12135.5 799.13
library("glmnet")
## Loading required package: Matrix
## Loaded glmnet 3.0-2
# Design matrix / response: column 11 of df is Artist_Popularity (see sel.var).
xl <- as.matrix(df[,-11])
yl <- df[,11]
set.seed(42)
lmlasso <- cv.glmnet(xl, yl, alpha =1) # cross-validation finds the value of lambda which minimizes MSE, alpha = 1 for LASSO
plot(lmlasso) # displays the bias-variance trade-off (MSE = variance + bias^2)

best.lambda <- lmlasso$lambda.min
lmlasso$lambda.1se
## [1] 0.3361553
# Side-by-side coefficient table: LASSO at lambda.min / lambda.1se,
# the backward-stepwise model, and the full OLS model.
cl <- list(coef(lmlasso, s="lambda.min")[,1], coef(lmlasso, s="lambda.1se")[,1], coef(lmback), coef(lmfull))
cf <- matrix(NA, nrow=ncol(xl)+1, ncol=length(cl))  # +1 row for the intercept
row.names(cf) <- names(lmfull$coefficients)
colnames(cf) <- c("min", "1se", "back", "full")
# Fill by coefficient name so terms a model dropped stay NA in its column;
# seq_along() replaces the original hand-maintained counter variable.
for (i in seq_along(cl)) {
  ci <- cl[[i]]
  cf[names(ci), i] <- ci[names(ci)]
}
cf[cf==0] <- NA  # LASSO zeros mean "excluded" -> display as NA like the others
cf
##                             min           1se          back          full
## (Intercept)        6.040412e+01  6.114886e+01  5.987096e+01  5.933345e+01
## fa_1              -4.950318e-02            NA            NA -8.893245e-01
## fa_2               5.222430e+00  2.626698e+00  7.209531e+00  7.217918e+00
## fa_2sq            -3.457641e+00 -5.763024e-01 -5.658801e+00 -5.837164e+00
## fa_3               3.323039e+00  2.638256e+00  3.769373e+00  4.505508e+00
## fa_4                         NA            NA            NA  3.733388e-01
## pc_1               8.325534e-01  6.300647e-01  9.809889e-01  1.111732e+00
## pc_2               4.444693e+00  4.581303e+00  4.356624e+00  4.151896e+00
## pc_2sq            -1.666188e+00 -1.671870e+00 -1.667241e+00 -1.600879e+00
## Artist_Follower    3.098932e+00  1.731344e+00  4.143098e+00  4.255461e+00
## Artist_Followersq -1.000969e-01 -2.026752e-02 -1.613611e-01 -1.647535e-01
## days_release_orig -2.334705e-04 -2.257810e-04 -2.377630e-04 -2.659810e-04
## Track_Popularity   1.908254e-01  1.868568e-01  1.935870e-01  1.997832e-01
## Track_Duration_ms -1.648856e-05 -1.812532e-05 -1.528159e-05 -1.495624e-05
# variable importance
# Refit at the CV-optimal lambda and rank terms by absolute coefficient size.
# NOTE(review): glmnet reports coefficients on the original variable scale,
# so this ranking mixes units -- confirm the predictors are comparably scaled.
glmmod <- glmnet(xl, yl, lambda = best.lambda)
coefs <- coef(glmmod)[, 1]                    # '<-' for assignment, not '='
coefs <- sort(abs(coefs), decreasing = TRUE)  # TRUE, never the reassignable T
coefs
##       (Intercept)              fa_2              pc_2            fa_2sq 
##      6.041106e+01      5.233869e+00      4.444640e+00      3.461076e+00 
##              fa_3   Artist_Follower            pc_2sq              pc_1 
##      3.325658e+00      3.096916e+00      1.666081e+00      8.321862e-01 
##  Track_Popularity Artist_Followersq              fa_1 days_release_orig 
##      1.907631e-01      9.999550e-02      5.110609e-02      2.337083e-04 
## Track_Duration_ms              fa_4 
##      1.649063e-05      0.000000e+00
# Sparser solution at the more conservative lambda.1se: fa_1 and fa_4 drop out.
coef(lmlasso, s="lambda.1se")
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                               1
## (Intercept)        6.114886e+01
## fa_1               .           
## fa_2               2.626698e+00
## fa_2sq            -5.763024e-01
## fa_3               2.638256e+00
## fa_4               .           
## pc_1               6.300647e-01
## pc_2               4.581303e+00
## pc_2sq            -1.671870e+00
## Artist_Follower    1.731344e+00
## Artist_Followersq -2.026752e-02
## days_release_orig -2.257810e-04
## Track_Popularity   1.868568e-01
## Track_Duration_ms -1.812532e-05
######################
# Performance metrics - full data

# Rows: 1 = OLS (lmfull), 2 = ridge, 3 = LASSO, 4 = elasticnet (rows 5-10 unused here).
# Cols: 1-2 full-data RMSE/R^2, 3-4 train RMSE/R^2, 5-6 test RMSE/R^2.
# NOTE(review): "performace" is a typo for "performance"; kept as-is because
# the name is referenced throughout the rest of the script.
# NOTE(review): RMSE()/R2() are caret functions; library("caret") only appears
# further below -- presumably caret is loaded earlier in the file; confirm.
performace_metrics <- matrix(nrow = 10, ncol = 6)

RMSE(predict(lmfull, newdata = df), df$Artist_Popularity)
## [1] 6.913336
R2(predict(lmfull, newdata = df), df$Artist_Popularity)
## [1] 0.7910356
performace_metrics[1,1] <- RMSE(predict(lmfull, newdata = df), df$Artist_Popularity)
performace_metrics[1,2] <- R2(predict(lmfull, newdata = df), df$Artist_Popularity)


# x_model <- model.matrix(Artist_Popularity~., x_noout)

library("glmnet")
# Column 11 of df is Artist_Popularity (see sel.var). The design matrix and
# response are needed by every cv.glmnet/glmnet/predict call below, so build
# them once instead of re-coercing df on every call.
x_mat <- as.matrix(df[,-11])
y_mat <- as.matrix(df[,11])

"Ridge regression"
## [1] "Ridge regression"
set.seed(42)
best.lambda <- cv.glmnet(x = x_mat, y = y_mat, alpha = 0)$lambda.min
ridge.cv <- glmnet(x = x_mat, y = y_mat, alpha = 0, lambda = best.lambda)
# as.numeric(coef(ridge.cv)[,1]) %*% as.numeric(x_model[1,]) equivalent
# predict(ridge.cv, newx = as.matrix(x_noout[1,-8])) equivalent
RMSE(predict(ridge.cv, newx = x_mat), y_mat)
## [1] 7.239088
R2(predict(ridge.cv, newx = x_mat), y_mat)
##             s0
## [1,] 0.7724354
performace_metrics[2,1] <- RMSE(predict(ridge.cv, newx = x_mat), y_mat)
performace_metrics[2,2] <- R2(predict(ridge.cv, newx = x_mat), y_mat)

"LASSO regression"
## [1] "LASSO regression"
set.seed(42)
best.lambda <- cv.glmnet(x = x_mat, y = y_mat, alpha = 1)$lambda.min
lasso.cv <- glmnet(x = x_mat, y = y_mat, alpha = 1, lambda = best.lambda)
RMSE(predict(lasso.cv, newx = x_mat), y_mat)
## [1] 7.029595
R2(predict(lasso.cv, newx = x_mat), y_mat)
##             s0
## [1,] 0.7847568
performace_metrics[3,1] <- RMSE(predict(lasso.cv, newx = x_mat), y_mat)
performace_metrics[3,2] <- R2(predict(lasso.cv, newx = x_mat), y_mat)

"Elasticnet regression"
## [1] "Elasticnet regression"
# alpha = 0.5 mixes the ridge and LASSO penalties equally.
set.seed(42)
best.lambda <- cv.glmnet(x = x_mat, y = y_mat, alpha = 0.5)$lambda.min
elasticnet.cv <- glmnet(x = x_mat, y = y_mat, alpha = 0.5, lambda = best.lambda)
RMSE(predict(elasticnet.cv, newx = x_mat), y_mat)
## [1] 7.043502
R2(predict(elasticnet.cv, newx = x_mat), y_mat)
##             s0
## [1,] 0.7839069
performace_metrics[4,1] <- RMSE(predict(elasticnet.cv, newx = x_mat), y_mat)
performace_metrics[4,2] <- R2(predict(elasticnet.cv, newx = x_mat), y_mat)

# Model matrix (intercept column dropped) for the glmnet predict() calls below.
x_full <- model.matrix(Artist_Popularity~., df)[,-1]

# Per-observation squared residuals for each regularized fit.
ridge_res2 <- as.numeric(df$Artist_Popularity - predict(ridge.cv, x_full))^2
lasso_res2 <- as.numeric(df$Artist_Popularity - predict(lasso.cv, x_full))^2
elasticnet_res2 <- as.numeric(df$Artist_Popularity - predict(elasticnet.cv, x_full))^2

# 2x2 panel of per-observation squared errors, one panel per model.
par(mfcol=c(2,2))
barplot((df$Artist_Popularity - predict(lmfull, df))^2)
Axis(side=2)
title(main = paste("LM (full sample): RMSE =",round(RMSE(predict(lmfull, df), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lmfull, df), df$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (full sample): RMSE =",round(RMSE(predict(ridge.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_full), df$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (full sample): RMSE =",round(RMSE(predict(lasso.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_full), df$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (full sample): RMSE =",round(RMSE(predict(elasticnet.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_full), df$Artist_Popularity), digits = 2)))

library("caret")

# Splitting in train and test data

# 75/25 split; createDataPartition() stratifies on the outcome variable.
set.seed(42)
idx.train <- createDataPartition(y = df$Artist_Popularity, p = 0.75, list = FALSE)
train <- df[idx.train, ]
test <- df[-idx.train,]

# OLS baseline fitted on the training data only.
lm_train <- lm(Artist_Popularity~., data=train)
RMSE(predict(lm_train, newdata = train), train$Artist_Popularity)
## [1] 6.835059
R2(predict(lm_train, newdata = train), train$Artist_Popularity)
## [1] 0.7999369
# columns 3-4: in-sample (train) RMSE / R^2
performace_metrics[1,3] <- RMSE(predict(lm_train, newdata = train), train$Artist_Popularity)
performace_metrics[1,4] <- R2(predict(lm_train, newdata = train), train$Artist_Popularity)

RMSE(predict(lm_train, newdata = test), test$Artist_Popularity)
## [1] 9.279663
R2(predict(lm_train, newdata = test), test$Artist_Popularity)
## [1] 0.6328343
# columns 5-6: out-of-sample (test) RMSE / R^2 -- clearly worse than train
performace_metrics[1,5] <- RMSE(predict(lm_train, newdata = test), test$Artist_Popularity)
performace_metrics[1,6] <- R2(predict(lm_train, newdata = test), test$Artist_Popularity)

# Regularized fits on the training data, evaluated on train and test.
# Column 11 is Artist_Popularity (see sel.var). The train/test matrices are
# reused many times below, so build each of them once.
x_tr <- as.matrix(train[,-11])
y_tr <- as.matrix(train[,11])
x_te <- as.matrix(test[,-11])
y_te <- as.matrix(test[,11])

"Ridge regression"
## [1] "Ridge regression"
set.seed(42)
best.lambda <- cv.glmnet(x = x_tr, y = y_tr, alpha = 0)$lambda.min
ridge.cv <- glmnet(x = x_tr, y = y_tr, alpha = 0, lambda = best.lambda)
# as.numeric(coef(ridge.cv)[,1]) %*% as.numeric(x_model[1,]) equivalent
# predict(ridge.cv, newx = as.matrix(x_noout[1,-8])) equivalent
RMSE(predict(ridge.cv, newx = x_tr), y_tr)
## [1] 7.209575
R2(predict(ridge.cv, newx = x_tr), y_tr)
##             s0
## [1,] 0.7787364
performace_metrics[2,3] <- RMSE(predict(ridge.cv, newx = x_tr), y_tr)
performace_metrics[2,4] <- R2(predict(ridge.cv, newx = x_tr), y_tr)

RMSE(predict(ridge.cv, newx = x_te), y_te)
## [1] 8.186115
R2(predict(ridge.cv, newx = x_te), y_te)
##             s0
## [1,] 0.7030654
performace_metrics[2,5] <- RMSE(predict(ridge.cv, newx = x_te), y_te)
performace_metrics[2,6] <- R2(predict(ridge.cv, newx = x_te), y_te)


"LASSO regression"
## [1] "LASSO regression"
set.seed(42)
best.lambda <- cv.glmnet(x = x_tr, y = y_tr, alpha = 1)$lambda.min
lasso.cv <- glmnet(x = x_tr, y = y_tr, alpha = 1, lambda = best.lambda)
RMSE(predict(lasso.cv, newx = x_tr), y_tr)
## [1] 6.940459
R2(predict(lasso.cv, newx = x_tr), y_tr)
##             s0
## [1,] 0.7944228
performace_metrics[3,3] <- RMSE(predict(lasso.cv, newx = x_tr), y_tr)
performace_metrics[3,4] <- R2(predict(lasso.cv, newx = x_tr), y_tr)

RMSE(predict(lasso.cv, newx = x_te), y_te)
## [1] 7.695779
R2(predict(lasso.cv, newx = x_te), y_te)
##             s0
## [1,] 0.7335293
performace_metrics[3,5] <- RMSE(predict(lasso.cv, newx = x_te), y_te)
performace_metrics[3,6] <- R2(predict(lasso.cv, newx = x_te), y_te)

"Elasticnet regression"
## [1] "Elasticnet regression"
# alpha = 0.5 mixes the ridge and LASSO penalties equally.
set.seed(42)
best.lambda <- cv.glmnet(x = x_tr, y = y_tr, alpha = 0.5)$lambda.min
elasticnet.cv <- glmnet(x = x_tr, y = y_tr, alpha = 0.5, lambda = best.lambda)
RMSE(predict(elasticnet.cv, newx = x_tr), y_tr)
## [1] 6.935805
R2(predict(elasticnet.cv, newx = x_tr), y_tr)
##             s0
## [1,] 0.7945423
performace_metrics[4,3] <- RMSE(predict(elasticnet.cv, newx = x_tr), y_tr)
performace_metrics[4,4] <- R2(predict(elasticnet.cv, newx = x_tr), y_tr)

RMSE(predict(elasticnet.cv, newx = x_te), y_te)
## [1] 8.055396
R2(predict(elasticnet.cv, newx = x_te), y_te)
##             s0
## [1,] 0.7106164
performace_metrics[4,5] <- RMSE(predict(elasticnet.cv, newx = x_te), y_te)
performace_metrics[4,6] <- R2(predict(elasticnet.cv, newx = x_te), y_te)
  
# Graphics

# Training data

# Model matrix (intercept dropped) for the glmnet predict() calls below.
x_train <- model.matrix(Artist_Popularity~., train)[,-1]

# Per-observation squared residuals on the training data.
ridge_res2 <- as.numeric(train$Artist_Popularity - predict(ridge.cv, x_train))^2
lasso_res2 <- as.numeric(train$Artist_Popularity - predict(lasso.cv, x_train))^2
elasticnet_res2 <- as.numeric(train$Artist_Popularity - predict(elasticnet.cv, x_train))^2

# 2x2 panel of squared training errors, one panel per model.
par(mfcol=c(2,2))
barplot((train$Artist_Popularity - predict(lm_train, train))^2, main = paste("LM (train): RMSE =",round(RMSE(predict(lm_train, train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lm_train, train), train$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (train): RMSE =",round(RMSE(predict(ridge.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_train), train$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (train): RMSE =",round(RMSE(predict(lasso.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_train), train$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (train): RMSE =",round(RMSE(predict(elasticnet.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_train), train$Artist_Popularity), digits = 2)))

# Test data

x_test <- model.matrix(Artist_Popularity~., test)[,-1]

# Per-observation squared residuals on the held-out test data.
ridge_res2 <- as.numeric(test$Artist_Popularity - predict(ridge.cv, x_test))^2
lasso_res2 <- as.numeric(test$Artist_Popularity - predict(lasso.cv, x_test))^2
elasticnet_res2 <- as.numeric(test$Artist_Popularity - predict(elasticnet.cv, x_test))^2

# Same 2x2 panel for the test data.
par(mfcol=c(2,2))
barplot((test$Artist_Popularity - predict(lm_train, test))^2, main = paste("LM (test): RMSE =",round(RMSE(predict(lm_train, test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lm_train, test), test$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (test): RMSE =",round(RMSE(predict(ridge.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_test), test$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (test): RMSE =",round(RMSE(predict(lasso.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_test), test$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (test): RMSE =",round(RMSE(predict(elasticnet.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_test), test$Artist_Popularity), digits = 2)))

# Non-parametric and semiparametric regression

# Kernel density estimator

library("np")
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-10)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
# Cross-validated bandwidth for the univariate density of Track_Duration_ms.
# FIX: the "Multistart" progress-bar lines below had lost their "## " prefix
# and were syntax errors when sourcing this script; they are recorded output,
# not code, so they are commented out.
bw   <- npudensbw(~Track_Duration_ms, data=df) # Track_Duration_ms bi-modal?
## 
## Multistart 1 of 1 |
## Multistart 1 of 1 |
## Multistart 1 of 1 |
## Multistart 1 of 1 /
## Multistart 1 of 1 |
## Multistart 1 of 1 |
##                    
fhat <- npudens(bw)
fhat
## 
## Density Data: 186 training points, in 1 variable(s)
##               Track_Duration_ms
## Bandwidth(s):          25427.78
## 
## Bandwidth Type: Fixed
## Log Likelihood: -2404.132
## 
## Continuous Kernel Type: Second-Order Gaussian
## No. Continuous Vars.: 1
plot(fhat, main=sprintf("%s with h=%.2f", fhat$pckertype, fhat$bw))
rug(df$Track_Duration_ms)

# Bivariate density of (Artist_Popularity, Track_Duration_ms); the quick
# normal-reference rule is used instead of cross-validation here.
bw <- npudensbw(~Artist_Popularity+Track_Duration_ms, data=df, bwmethod="normal-reference")
fhat <- npudens(bw)
# plot(fhat) # bi-modal

# Two perspective views of the same density surface, rotated to show each mode.
par(mfcol=c(1,1))
#1) at Track_Duration_ms ~ 1 and Artist_Popularity ~ -1 (meaning longer tracks' artists are less popular)
plot(fhat, view="fixed", phi=10, theta=320, main = "")

par(mfcol=c(1,1))
# 2) at Track_Duration_ms ~ -0.5 and Artist_Popularity ~ 1
plot(fhat, view="fixed", phi=10, theta=155, main = "")

par(mfcol=c(1,1))
# Local-constant (Nadaraya-Watson) kernel regression of Artist_Popularity on
# pc_2 and Track_Duration_ms, with cross-validated bandwidths.
nw2 <- npreg(Artist_Popularity~pc_2+Track_Duration_ms, data=df)
## 
## FIX: the lines below are npreg's bandwidth-search progress output captured
## WITHOUT comment prefixes -- a syntax error when sourcing. Commented out.
## Multistart 1 of 2 |
## Multistart 1 of 2 |
## Multistart 1 of 2 |
## Multistart 1 of 2 /
## Multistart 1 of 2 -
## Multistart 1 of 2 |
## Multistart 1 of 2 |
## Multistart 2 of 2 |
## Multistart 2 of 2 |
## Multistart 2 of 2 /
## Multistart 2 of 2 -
## Multistart 2 of 2 |
## Multistart 2 of 2 |
##
summary(nw2)
## 
## Regression Data: 186 training points, in 2 variable(s)
##                    pc_2 Track_Duration_ms
## Bandwidth(s): 0.6558287          64051.76
## 
## Kernel Regression Estimator: Local-Constant
## Bandwidth Type: Fixed
## Residual standard error: 9.136543
## R-squared: 0.6351856
## 
## Continuous Kernel Type: Second-Order Gaussian
## No. Continuous Explanatory Vars.: 2
# plot(nw2)

# Nadaraya-Watson estimator

# bw <- npregbw(Artist_Popularity~Track_Duration_ms, data=df)
# mhat <- npreg(bw)
# main <- sprintf("%s with h=%.2f", mhat$pckertype, mhat$bw)
# plot(df$Artist_Popularity, df$Track_Duration_ms, pch=19, cex=0.3, main=main)
# ind <- order(df$Track_Duration_ms)
# xs  <- cbind(df$Track_Duration_ms, fitted(mhat))[ind,]
# lines(xs[,1], xs[,2], col="red", lwd=2)
# rug(df$Track_Duration_ms)

# bw <- npregbw(Artist_Popularity~Track_Duration_ms+Track_Popularity, data=df)
# mhat <- npreg(bw)
# plot(mhat)


plotContour <- function (model, data, n=30) {
  # Contour plot of a fitted regression surface over its two predictors,
  # overlaying the observations shaded by their response value
  # (darker point = higher response).
  #
  # model: a fitted model with a $terms component and a predict() method,
  #        built from exactly two predictor variables (e.g. lm(y ~ x1 + x2)).
  # data:  the data frame the model was fitted on.
  # n:     approximate number of grid points per axis (passed to pretty()).
  #
  # Returns the in-sample R^2 invisibly (also shown in the plot title).
  mf <- model.frame(model$terms, data)
  if (ncol(mf) != 3) {
    # model.frame is response + predictors; contour() needs exactly 2 predictors.
    stop("plotContour() expects a model with exactly two predictors", call. = FALSE)
  }
  # Evaluation grid spanning both predictor ranges.
  grid_axes <- lapply(as.list(mf[, -1]), pretty, n = n)
  z <- predict(model, expand.grid(grid_axes))
  dim(z) <- vapply(grid_axes, length, integer(1))
  # In-sample R^2 = 1 - SSE/SST (both vars use the same n-1 denominator).
  r2 <- 1 - var(residuals(model)) / var(mf[, 1])
  contour(grid_axes[[1]], grid_axes[[2]], z,
          xlab = names(mf)[2], ylab = names(mf)[3],
          main = sprintf("R^2=%.3f", r2))
  # Gray level 0.75 (light) at the minimum response down to 0 (black) at the max.
  shade <- gray(0.75 - 0.75 * (mf[, 1] - min(mf[, 1])) / (max(mf[, 1]) - min(mf[, 1])))
  points(mf[, 2], mf[, 3], pch = 19, cex = 0.5, col = shade)
  invisible(r2)
}


# Contour of a linear fit of popularity on the first two principal components,
# drawn with the plotContour() helper defined above.
model <- lm(Artist_Popularity~pc_1 + pc_2, data=df)
par(mfrow=c(1,1))
plotContour(model, df)

# Partial linear model (mixing linear and spline terms)

library("mgcv")
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:ryouready':
## 
##     intervals
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## The following object is masked from 'package:mclust':
## 
##     mvn
## The following object is masked from 'package:MVN':
## 
##     mvn
# Partial linear model (mgcv): smooth term for Track_Popularity, linear term
# for Track_Duration_ms. Overwrites the `model` binding from the lm above.
model <- gam(Artist_Popularity~s(Track_Popularity)+Track_Duration_ms, data=df)
# plotContour(model, df$Track_Popularity, df$Track_Duration_ms, df$Artist_Popularity)
# plot(model)

# Additive model

library("gam")
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
## 
## Attaching package: 'gam'
## The following objects are masked from 'package:mgcv':
## 
##     gam, gam.control, gam.fit, s
# Additive model (gam package): one smooth term per candidate predictor.
am1 <- gam(Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) + s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) + s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) + s(pc_2sq) + s(Artist_Followersq), data=df)
print(am1)
# One panel per fitted smooth, then a residuals-vs-fitted check with a lowess line.
par(mfrow=c(3,4))
plot(am1)

plot(fitted(am1), residuals(am1))
lines(lowess(fitted(am1), residuals(am1)), col="red", lwd=2)
# Pseudo R^2 = 1 - SSE/SST (printed).
n <- nrow(df)
1-sum(residuals(am1)^2)/((n-1)*var(df$Artist_Popularity))
# In-sample metrics; predictions computed once and reused.
am1_fit <- predict(am1, newdata = df)
performace_metrics[5,1] <- RMSE(am1_fit, df$Artist_Popularity)
performace_metrics[5,2] <- R2(am1_fit, df$Artist_Popularity)

# Refit on the training split only, then score train and test.
am1_train <- gam(Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) + s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) + s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) + s(pc_2sq) + s(Artist_Followersq), data=train)
am1_fit_train <- predict(am1_train, newdata = train)
performace_metrics[5,3] <- RMSE(am1_fit_train, train$Artist_Popularity)
performace_metrics[5,4] <- R2(am1_fit_train, train$Artist_Popularity)

am1_fit_test <- predict(am1_train, newdata = test)
performace_metrics[5,5] <- RMSE(am1_fit_test, test$Artist_Popularity)
performace_metrics[5,6] <- R2(am1_fit_test, test$Artist_Popularity)


# Single index model: E[y|x] = g(x'beta), estimated semiparametrically (np).

sim <- npindex(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=df)
summary(sim)
# In-sample metrics; predictions computed once and reused.
sim_fit <- predict(sim, newdata = df)
performace_metrics[6,1] <- RMSE(sim_fit, df$Artist_Popularity)
performace_metrics[6,2] <- R2(sim_fit, df$Artist_Popularity)

# Refit on the training split, then score train and test.
sim_train <- npindex(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=train)
sim_fit_train <- predict(sim_train, newdata = train)
performace_metrics[6,3] <- RMSE(sim_fit_train, train$Artist_Popularity)
performace_metrics[6,4] <- R2(sim_fit_train, train$Artist_Popularity)

sim_fit_test <- predict(sim_train, newdata = test)
performace_metrics[6,5] <- RMSE(sim_fit_test, test$Artist_Popularity)
performace_metrics[6,6] <- R2(sim_fit_test, test$Artist_Popularity)

# Diagnostic plots and a residuals-vs-fitted check with a lowess line.
par(mfrow=c(3,4))

plot(sim)

plot(fitted(sim), residuals(sim))
lines(lowess(fitted(sim), residuals(sim)), col="red", lwd=2)

# Pseudo R^2 = 1 - SSE/SST (printed).
n <- nrow(df)
1-sum(residuals(sim)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.8674403
# Projection Pursuit Regression with 5 ridge terms.
# NOTE: the fitted object is assigned to `ppr`, shadowing stats::ppr as a
# variable; later calls still resolve the function because R skips non-function
# bindings when looking up a call target.

ppr <- ppr(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=df, nterm=5)

# In-sample metrics; predictions computed once and reused.
ppr_fit <- predict(ppr, newdata = df)
performace_metrics[7,1] <- RMSE(ppr_fit, df$Artist_Popularity)
performace_metrics[7,2] <- R2(ppr_fit, df$Artist_Popularity)

# Refit on the training split, then score train and test.
ppr_train <- ppr(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=train, nterm=5)

ppr_fit_train <- predict(ppr_train, newdata = train)
performace_metrics[7,3] <- RMSE(ppr_fit_train, train$Artist_Popularity)
performace_metrics[7,4] <- R2(ppr_fit_train, train$Artist_Popularity)
ppr_fit_test <- predict(ppr_train, newdata = test)
performace_metrics[7,5] <- RMSE(ppr_fit_test, test$Artist_Popularity)
performace_metrics[7,6] <- R2(ppr_fit_test, test$Artist_Popularity)

# Projection directions ('alpha') and ridge-term coefficients ('beta').
summary(ppr)
par(mfrow=c(1,1))
plot(ppr)

# Residuals-vs-fitted check with a lowess line.
plot(fitted(ppr), residuals(ppr))
lines(lowess(fitted(ppr), residuals(ppr)), col="red", lwd=2)

# Pseudo R^2 = 1 - SSE/SST (printed).
n <- nrow(df)
1-sum(residuals(ppr)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.9590653
library("rpart")
set.seed(42)
# Full regression tree; cp = 0 lets the tree grow until the minsplit/minbucket
# defaults stop it (pruned later via the cp table).
tr1 <- rpart(Artist_Popularity~fa_1+fa_2+fa_3+fa_4+pc_1+pc_2+Artist_Follower+days_release_orig
             +Track_Popularity+Track_Duration_ms+fa_2sq+pc_2sq+Artist_Followersq, data=df, cp = 0)

tr1
par(mfrow=c(1,1))
plot(tr1)
text(tr1)

# summary(tr1)

library("rpart.plot")
rpart.plot(tr1, cex = 0.7)

# FIX: was `n <- nrow(x_noout)`; every other section uses nrow(df), and tr1 was
# fitted on df. (The transcript values agree, so the row counts matched anyway.)
n <- nrow(df)
# Pseudo R^2 = 1 - SSE/SST (printed). Transcript: 0.9131701.
1-sum(residuals(tr1)^2)/((n-1)*var(df$Artist_Popularity))
# FIX: predict.rpart() has no `newx` argument -- it was silently swallowed by
# `...` and the calls fell back to the training fit. Pass `newdata` explicitly
# and compare against the response vector; the numeric results are unchanged.
tr1_fit <- predict(tr1, newdata = df)
RMSE(tr1_fit, df$Artist_Popularity)
## [1] 4.45642
R2(tr1_fit, df$Artist_Popularity)
## [1] 0.9131701
performace_metrics[8,1] <- RMSE(tr1_fit, df$Artist_Popularity)
performace_metrics[8,2] <- R2(tr1_fit, df$Artist_Popularity)

par(mfrow=c(1,1))
# Complexity-parameter table from the internal cross-validation run by rpart.
printcp(tr1)
plotcp(tr1)

tr1$cptable
# Prune at the cp value minimising the cross-validated error (xerror).
best.cp <- tr1$cptable[which.min(tr1$cptable[,"xerror"]),"CP"]
best.cp
## [1] 0.002489096
dt_pruned <- prune(tr1, cp=best.cp)

# Pseudo R^2 of the pruned tree (n still nrow(df) from above). Transcript: 0.9062339.
1-sum(residuals(dt_pruned)^2)/((n-1)*var(df$Artist_Popularity))
# FIX: `newx` is not a predict.rpart() argument (it was ignored); use `newdata`.
dt_pruned_fit <- predict(dt_pruned, newdata = df)
RMSE(dt_pruned_fit, df$Artist_Popularity)
## [1] 4.630997
R2(dt_pruned_fit, df$Artist_Popularity) # lower R^2 after pruning
##           [,1]
## [1,] 0.9062339
library("caret")

# Decision tree tuned by repeated 10-fold CV (3 repeats) on the FULL data set,
# searching 15 cp values.

caret.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Artist_Popularity ~ ., 
                         data = df,
                         method = "rpart",
                         trControl = caret.control,
                         tuneLength = 15)
rpart.cv
# Tree refit at the cp chosen by CV.
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
rpart.plot(rpart.best.cv)

# In-sample fit of the CV-selected tree; predictions computed once and reused.
rpart.cv.fit <- predict(rpart.best.cv, newdata = df)
RMSE(rpart.cv.fit, df$Artist_Popularity) # corresponds exactly to the pruned tree solution
R2(rpart.cv.fit, df$Artist_Popularity) # corresponds exactly to the pruned tree solution
## [1] 0.9000049
# Decision tree tuned by repeated 10-fold CV on a 75/25 train/test split.

set.seed(42)
idx.train <- createDataPartition(y = df$Artist_Popularity, p = 0.75, list = FALSE)
train.data <- df[idx.train, ]
test.data <- df[-idx.train,]

caret.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Artist_Popularity ~ ., 
                         data = train.data,
                         method = "rpart",
                         trControl = caret.control,
                         tuneLength = 15)
rpart.cv
# Tree refit at the cp chosen by CV on the training split.
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
rpart.plot(rpart.best.cv)

# Train and test metrics; predictions computed once per split and reused.
rpart.cv.fit.train <- predict(rpart.best.cv, newdata = train.data)
RMSE(rpart.cv.fit.train, train.data$Artist_Popularity)
R2(rpart.cv.fit.train, train.data$Artist_Popularity)
performace_metrics[8,3] <- RMSE(rpart.cv.fit.train, train.data$Artist_Popularity)
performace_metrics[8,4] <- R2(rpart.cv.fit.train, train.data$Artist_Popularity)

rpart.cv.fit.test <- predict(rpart.best.cv, newdata = test.data)
RMSE(rpart.cv.fit.test, test.data$Artist_Popularity)
R2(rpart.cv.fit.test, test.data$Artist_Popularity)
performace_metrics[8,5] <- RMSE(rpart.cv.fit.test, test.data$Artist_Popularity)
performace_metrics[8,6] <- R2(rpart.cv.fit.test, test.data$Artist_Popularity)

#########################################################################

library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
set.seed(42)
# Random forest on the full data.
# FIX: dropped `cp = 0` and `method = 'anova'` -- those are rpart() arguments;
# randomForest() silently swallowed them via `...`, so they had no effect.
rf <- randomForest(Artist_Popularity~fa_1+fa_2+fa_3+fa_4+pc_1+pc_2+Artist_Follower+days_release_orig
                   +Track_Popularity+Track_Duration_ms+fa_2sq+pc_2sq+Artist_Followersq, data=df)
summary(rf)
# Permutation/impurity importance, sorted descending.
imp <- importance(rf)
ind <- order(imp, decreasing=TRUE)
imp[ind,]
# FIX: `type = "class"` is a classification option; the default "response" is
# what a regression forest returns. Predictions computed once and reused.
rf_pred <- predict(rf, newdata = df)
n <- nrow(df)
# FIX: randomForest objects have no residuals() method, so residuals(rf) was
# NULL and the old pseudo-R^2 printed exactly 1 regardless of fit quality.
# Compute it from actual in-sample residuals instead.
1-sum((df$Artist_Popularity - rf_pred)^2)/((n-1)*var(df$Artist_Popularity))
RMSE(rf_pred, df$Artist_Popularity)
## [1] 2.114803
R2(rf_pred, df$Artist_Popularity)
## [1] 0.9825223
performace_metrics[9,1] <- RMSE(rf_pred, df$Artist_Popularity)
performace_metrics[9,2] <- R2(rf_pred, df$Artist_Popularity)

# RMSE(predict(rf, newdata = train.data), train.data$Artist_Popularity)
# R2(predict(rf, newdata = train.data), train.data$Artist_Popularity)

# RMSE(predict(rf, newdata = test.data), test.data$Artist_Popularity)
# R2(predict(rf, newdata = test.data), test.data$Artist_Popularity)


#########################################################################
library("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library("mlr")
## Loading required package: ParamHelpers
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
## 
## Attaching package: 'mlr'
## The following objects are masked _by_ '.GlobalEnv':
## 
##     f1, rmse
## The following object is masked from 'package:e1071':
## 
##     impute
## The following object is masked from 'package:caret':
## 
##     train
library("parallel")

# NOTE(review): rf_train / rf_test are never used below -- the task is built
# from `train` directly and predictions use `train`/`test`. Confirm before removing.
rf_train <- as.data.table(train)
rf_test <- as.data.table(test)

# mlr regression task on the training split.
task <- makeRegrTask(data = train, target = "Artist_Popularity")

rf_learner <- makeLearner("regr.randomForest", 
                          predict.type = "response")

# Grid of random-forest hyperparameters to tune.
rf.parms <- makeParamSet(
  # The recommendation for mtry by Breiman is squareroot number of columns
  makeDiscreteParam("mtry", values = c(2,3,4,5,6)), # Number of features selected at each node, smaller -> faster
  makeDiscreteParam("sampsize", values =  c(30, 50, 70)), # bootstrap sample size, smaller -> faster
  makeDiscreteParam("ntree", values = c(10,30,50,100, 500, 1000)) # Number of tree, smaller -> faster
) 

# Exhaustive grid search (discrete params, so resolution is moot) with 5-fold CV.
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)

rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = FALSE)

library("parallelMap")

# Parallelize only the tuning loop over 4 socket workers.
parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
#tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
#                     par.set = rf.parms, control = tuneControl, measures = mlr::rmse)

# https://stackoverflow.com/questions/51333410/mlr-why-does-reproducibility-of-hyperparameter-tuning-fail-using-parallelizatio

# Seed both the master RNG and the workers' L'Ecuyer streams so the
# parallelized tuning run is reproducible (see the SO link above).
suppressMessages({

  set.seed(123456, "L'Ecuyer-CMRG")
  clusterSetRNGStream(iseed = 123456)
  tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
                     par.set = rf.parms, control = tuneControl, measures = mlr::rmse)
  
})

parallelStop()
## Stopped parallelization. All cleaned up.
# Best hyperparameter combination found by the grid search:
tuning$x
## $mtry
## [1] 5
## 
## $sampsize
## [1] 70
## 
## $ntree
## [1] 1000
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = TRUE)

tuning_results$data
# Mean CV RMSE per mtry value, averaged over the other two parameters.
tapply(tuning_results$data$rmse.test.rmse, INDEX = c(tuning_results$data$mtry), mean)
##        2        3        4        5        6 
## 6.276888 5.882133 5.681335 5.635646 5.688866
# Refit one final model on the full training data with the best parameters.
rf_tuned <- setHyperPars(rf_learner, par.vals = tuning$x)
rf_model <- mlr::train(rf_tuned, task = task)
rf_pred_train <- predict(rf_model, newdata = train, type = "response")
rf_pred_test <- predict(rf_model, newdata = test, type = "response")
# rf_pred_train$data$response
# rf_pred_test$data$response

# Train-set fit of the tuned mlr random forest (row 9, cols 3-4 of the
# shared results matrix).
RMSE(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
## [1] 3.370124
R2(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
## [1] 0.9581589
performace_metrics[9,3] <- RMSE(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
performace_metrics[9,4] <- R2(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)

# Test-set fit (row 9, cols 5-6).
# FIX: the original passed type = "class" in the two calls below -- a
# classification prediction type on a regression model, inconsistent with
# every other predict() call in this section. Changed to "response".
# NOTE(review): mlr's predict.WrappedModel does not document a `type`
# argument (the learner's predict.type governs the output), so the value is
# most likely absorbed by `...` and ignored -- confirm; fixed for
# consistency either way.
RMSE(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
## [1] 4.740843
R2(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
## [1] 0.9040584
performace_metrics[9,5] <- RMSE(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
performace_metrics[9,6] <- R2(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
library("MASS")
library("nnet")
## 
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
## 
##     multinom
# z-standardize every column (df must be all-numeric at this point) and
# re-split into the same train/test rows used throughout the script.
df_z <- scale(as.matrix(df))
train_z <- as.data.frame(df_z[idx.train, ])
test_z <- as.data.frame(df_z[-idx.train,])

# Matrix of column means with one identical row per observation, used to
# back-transform scaled predictions: only row 1 is set to the means, then
# na.locf() carries it forward over the remaining all-NA rows.
mean_mat <- matrix(, nrow = dim(df)[1], ncol = dim(df)[2])
mean_mat[1, ] <- as.numeric(sapply(df, mean, na.rm = TRUE))

library("zoo")
mean_mat <- na.locf(mean_mat)

# Diagonal matrix of the column standard deviations (same column order as df).
diag_sd <- diag(sapply(df, sd, na.rm = TRUE))

# Back-transformation of scaled values to the original units:

# as.matrix(df_z) %*% diag_sd + mean_mat

# Regression task on the standardized data; single-hidden-layer net with
# MaxNWts raised so the larger `size` values in the grid below fit.
task <- makeRegrTask(data = train_z, target = "Artist_Popularity")
nnet <- makeLearner("regr.nnet", predict.type = "response", par.vals = list("trace" = FALSE, "maxit" = 1000,
                                                                           "MaxNWts" = 80000))

# Tuning grid for the regression net: weight decay x hidden-layer size.
nn.parms <- makeParamSet(
  makeDiscreteParam("decay", values = c(0.0001,0.001, 0.01, 0.1)), 
  makeDiscreteParam("size", values = c(2,4,6,8,10,12,14)))

tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)

rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = FALSE)


parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
# tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
#                     measures = mlr::rmse)

# Reproducible parallel tuning (master seed + worker RNG streams).
suppressMessages({

  set.seed(123456, "L'Ecuyer-CMRG")
  clusterSetRNGStream(iseed = 123456)
  tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
                     measures = mlr::rmse)
})

parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x # result with full data set was decay = 1e-04, size = 2, rmse.test.rmse = 0.4336486
## $decay
## [1] 0.01
## 
## $size
## [1] 2
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = FALSE)

# CV RMSE as a function of each hyperparameter.
plotHyperParsEffect(tuning_results, x = "size", y = "rmse.test.rmse")

plotHyperParsEffect(tuning_results, x = "decay", y = "rmse.test.rmse")

# Final regression net with the tuned hyperparameters.
nnet.tuned <- setHyperPars(nnet, par.vals = tuning$x)
nn_model <- mlr::train(nnet.tuned, task = task)

nn_pred_train <- predict(nn_model, newdata = train_z, type = "response")
nn_pred_test <- predict(nn_model, newdata = test_z, type = "response")

# Metrics on the z-scale (predictions and targets both standardized).
RMSE(predict(nn_model, newdata = train_z, type = "response")$data$response, train_z$Artist_Popularity)
## [1] 0.3410186
R2(predict(nn_model, newdata = train_z, type = "response")$data$response, train_z$Artist_Popularity)
## [1] 0.8854886
RMSE(predict(nn_model, newdata = test_z, type = "response")$data$response, test_z$Artist_Popularity)
## [1] 0.4563401
R2(predict(nn_model, newdata = test_z, type = "response")$data$response, test_z$Artist_Popularity)
## [1] 0.8231612
# Scale predictions back to original:
# Column 11 of df corresponds to Artist_Popularity -- the commented sanity
# check below verifies the back-transformation on that column.

# as.matrix(train_z[, 11]) %*% diag_sd[11,11] + mean_mat[1,11] == train[,11]

# RMSE changes under the back-transformation, R^2 is scale-invariant
# (matches the z-scale value above). Row 10 = tuned regression net.
RMSE(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
## [1] 5.171305
R2(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
##           [,1]
## [1,] 0.8854886
performace_metrics[10,3] <- RMSE(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
performace_metrics[10,4] <- R2(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)

RMSE(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
## [1] 6.920074
R2(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
##           [,1]
## [1,] 0.8231612
performace_metrics[10,5] <- RMSE(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
performace_metrics[10,6] <- R2(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
# Model comparison. Columns: (1,2) full-data RMSE/R^2, (3,4) train RMSE/R^2,
# (5,6) test RMSE/R^2 (assignment pattern as above). Row 9 = tuned mlr random
# forest, row 10 = tuned regression net (full-data cells left NA); earlier
# rows are filled elsewhere in the file.
round(performace_metrics, digits = 2)
##       [,1] [,2] [,3] [,4]  [,5] [,6]
##  [1,] 6.91 0.79 6.84 0.80  9.28 0.63
##  [2,] 7.24 0.77 7.21 0.78  8.19 0.70
##  [3,] 7.03 0.78 6.94 0.79  7.70 0.73
##  [4,] 7.04 0.78 6.94 0.79  8.06 0.71
##  [5,] 3.37 0.95 3.32 0.95 53.30 0.22
##  [6,] 5.51 0.87 5.59 0.87  5.93 0.84
##  [7,] 3.06 0.96 3.11 0.96  8.09 0.77
##  [8,] 4.46 0.91 4.22 0.92  7.00 0.81
##  [9,] 2.11 0.98 3.37 0.96  4.74 0.90
## [10,]   NA   NA 5.17 0.89  6.92 0.82
# Classification ----
#
# Single rpart decision tree on the full data set, then cost-complexity
# pruning via the CP table.
#
# FIX: the fitted tree was originally stored in a variable named `rpart`,
# shadowing the rpart() function itself; renamed to `tree_full`. The
# prediction vector (an rpart prediction, not a random-forest one) is
# renamed from `rf_pred` to `tree_pred` accordingly.

df$Charts <- as.factor(x_noout$Charts)
set.seed(42)
tree_full <- rpart(data = df, Charts ~ ., method = 'class')
rpart.plot(tree_full)

# In-sample confusion matrix and accuracy of the unpruned tree.
tree_pred <- predict(tree_full, newdata = df, type = "class")
confMat <- table(df$Charts, tree_pred)
confMat # 0: not in charts, 1: in charts
##    tree_pred
##      0  1
##   0 93  0
##   1  7 86
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9623656
# 0.9623656

par(mfrow = c(1, 1))
printcp(tree_full)
## 
## Classification tree:
## rpart(formula = Charts ~ ., data = df, method = "class")
## 
## Variables actually used in tree construction:
## [1] fa_2              pc_1              Track_Duration_ms
## 
## Root node error: 93/186 = 0.5
## 
## n= 186 
## 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.795699      0  1.000000 1.18280 0.072088
## 2 0.118280      1  0.204301 0.21505 0.045429
## 3 0.010753      2  0.086022 0.10753 0.033076
## 4 0.010000      3  0.075269 0.13978 0.037390
plotcp(tree_full)

tree_full$cptable
##           CP nsplit  rel error    xerror       xstd
## 1 0.79569892      0 1.00000000 1.1827957 0.07208812
## 2 0.11827957      1 0.20430108 0.2150538 0.04542863
## 3 0.01075269      2 0.08602151 0.1075269 0.03307630
## 4 0.01000000      3 0.07526882 0.1397849 0.03738999
# Prune at the CP with the lowest cross-validated error.
best.cp <- tree_full$cptable[which.min(tree_full$cptable[, "xerror"]), "CP"]
best.cp
## [1] 0.01075269
set.seed(42)
rpart_pruned <- prune(tree_full, cp = best.cp)
rpart.plot(rpart_pruned)

# In-sample confusion matrix and accuracy of the pruned tree.
tree_pred <- predict(rpart_pruned, newdata = df, type = "class")
confMat <- table(df$Charts, tree_pred)
confMat # 0: not in charts, 1: in charts
##    tree_pred
##      0  1
##   0 89  4
##   1  4 89
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9569892
# 0.9569892

library("caret")

# Re-split df (now including the Charts factor) using the original indices.
train.data <- df[idx.train, ]
test.data <- df[-idx.train,]

# 10-fold cross-validation, repeated 3 times, for tuning rpart's cp.
caret.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3)

# tuneLength = 15 lets caret pick 15 candidate cp values automatically.
set.seed(42)
rpart.cv <- caret::train(Charts ~ ., 
                         data = train.data,
                         method = "rpart",
                         trControl = caret.control,
                         tuneLength = 15)

# Resampling summary: accuracy/kappa for every candidate cp.
rpart.cv
## CART 
## 
## 142 samples
##  14 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 127, 128, 127, 127, 128, 129, ... 
## Resampling results across tuning parameters:
## 
##   cp         Accuracy   Kappa    
##   0.0000000  0.8944567  0.7881329
##   0.0575693  0.8852503  0.7690778
##   0.1151386  0.8663126  0.7304763
##   0.1727079  0.8685348  0.7345800
##   0.2302772  0.8685348  0.7345800
##   0.2878465  0.8685348  0.7345800
##   0.3454158  0.8685348  0.7345800
##   0.4029851  0.8685348  0.7345800
##   0.4605544  0.8685348  0.7345800
##   0.5181237  0.8685348  0.7345800
##   0.5756930  0.8685348  0.7345800
##   0.6332623  0.8685348  0.7345800
##   0.6908316  0.8685348  0.7345800
##   0.7484009  0.8685348  0.7345800
##   0.8059701  0.6919536  0.3540940
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# NOTE(review): this seed has no apparent effect -- extracting finalModel
# is deterministic.
set.seed(42)
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
## n= 142 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 142 67 1 (0.47183099 0.52816901)  
##    2) Track_Duration_ms>=257068 56  1 0 (0.98214286 0.01785714) *
##    3) Track_Duration_ms< 257068 86 12 1 (0.13953488 0.86046512)  
##      6) fa_2< -0.220861 9  1 0 (0.88888889 0.11111111) *
##      7) fa_2>=-0.220861 77  4 1 (0.05194805 0.94805195)  
##       14) pc_1< -0.7482784 7  3 0 (0.57142857 0.42857143) *
##       15) pc_1>=-0.7482784 70  0 1 (0.00000000 1.00000000) *
rpart.plot(rpart.best.cv)

# Train-set confusion matrix / accuracy for the CV-selected tree.
rpart_pred <- predict(rpart.best.cv, newdata = train.data, type = "class")
confMat <- table(train.data$Charts,rpart_pred)
confMat # 0: not in charts, 1: in charts
##    rpart_pred
##      0  1
##   0 67  0
##   1  5 70
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9647887
# 0.9507042
# NOTE(review): the trailing "# 0.9507042" / "# 0.9318182" comments differ
# from the printed values above -- presumably left over from an earlier run.

# Test-set confusion matrix / accuracy.
rpart_pred <- predict(rpart.best.cv, newdata = test.data, type = "class")
confMat <- table(test.data$Charts,rpart_pred)
confMat # 0: not in charts, 1: in charts
##    rpart_pred
##      0  1
##   0 26  0
##   1  1 17
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9772727
# 0.9318182

#################################################

# Random forest

# Random forest classification with mlr, tuned on AUC.
rf_train <- train.data
rf_test <- test.data
task <- makeClassifTask(data = rf_train, target = "Charts", positive = "1")
rf_learner <- makeLearner("classif.randomForest", 
                          predict.type = "prob", # prediction type needs to be specified for the learner 
                          par.vals = list("replace" = TRUE, "importance" = FALSE))

rf.parms <- makeParamSet(
  # The recommendation for mtry by Breiman is squareroot number of columns
  makeDiscreteParam("mtry", values = c(2,3,4,5,6)), # Number of features selected at each node, smaller -> faster
  makeDiscreteParam("sampsize", values =  c(30, 50, 70)), # bootstrap sample size, smaller -> faster
  makeDiscreteParam("ntree", values = c(10,30,50,100, 500, 1000)) # Number of tree, smaller -> faster
) 

tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)

# Stratified 5-fold CV keeps the class balance within each fold.
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = TRUE)

parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
# Reproducible parallel tuning (master seed + worker RNG streams).
suppressMessages({

  set.seed(123456, "L'Ecuyer-CMRG")
  clusterSetRNGStream(iseed = 123456)
  tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
                     par.set = rf.parms, control = tuneControl, measures = mlr::auc)
})


parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x
## $mtry
## [1] 2
## 
## $sampsize
## [1] 70
## 
## $ntree
## [1] 30
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = TRUE)

tuning_results$data
# Mean CV AUC per mtry value, averaged over the other two parameters.
tapply(tuning_results$data$auc.test.mean, INDEX = c(tuning_results$data$mtry), mean)
##         2         3         4         5         6 
## 0.9959381 0.9951058 0.9951486 0.9935470 0.9945442
# Final model with the tuned hyperparameters; evaluate on train and test.
# NOTE(review): mlr's predict.WrappedModel does not document a `type`
# argument -- the learner's predict.type ("prob") governs the output and
# the hard labels come from $data$response; `type = "class"` is most
# likely absorbed by `...` and ignored. Verify.
rf_tuned <- setHyperPars(rf_learner, par.vals = tuning$x)
rf_model <- mlr::train(rf_tuned, task = task)
rf_pred <- predict(rf_model, newdata = train.data, type = "class")
# rf_pred$data$response
confMat <- table(train.data$Charts,rf_pred$data$response)
confMat # 0: not in charts, 1: in charts
##    
##      0  1
##   0 67  0
##   1  0 75
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 1
# 1

rf_pred <- predict(rf_model, newdata = test.data, type = "class")
# rf_pred$data$response
confMat <- table(test.data$Charts,rf_pred$data$response)
confMat # 0: not in charts, 1: in charts
##    
##      0  1
##   0 26  0
##   1  0 18
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 1
# 1

# Neural net classification
#
# Same standardized features as the regression net; Charts is appended
# unscaled as the class target.
# NOTE(review): unlike the tree section above, no as.factor() wrapper is
# applied here -- assumes x_noout$Charts is already a factor (Charts was
# factor-coded during data prep). Confirm.

df$Charts <- x_noout$Charts
train_z$Charts <- df[idx.train, "Charts"]
test_z$Charts <- df[-idx.train, "Charts"]


task <- makeClassifTask(data = train_z, target = "Charts", positive = "1")
nnet <- makeLearner("classif.nnet", predict.type = "prob", par.vals = list("trace" = FALSE, "maxit" = 1000,
                                                                           "MaxNWts" = 80000))

# Tuning grid: weight decay x hidden-layer size, scored on CV AUC.
nn.parms <- makeParamSet(
  makeDiscreteParam("decay", values = c(0.0001,0.001, 0.01, 0.1)), 
  makeDiscreteParam("size", values = c(2,4,6,8,10,12,14)))

tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)

# Stratified 5-fold CV keeps the class balance within each fold.
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = TRUE)


parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
# Reproducible parallel tuning (master seed + worker RNG streams).
suppressMessages({

  set.seed(123456, "L'Ecuyer-CMRG")
  clusterSetRNGStream(iseed = 123456)
  tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
                     measures = mlr::auc)
})

parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x
## $decay
## [1] 0.1
## 
## $size
## [1] 12
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = FALSE)

plotHyperParsEffect(tuning_results, x = "size", y = "auc.test.mean")

plotHyperParsEffect(tuning_results, x = "decay", y = "auc.test.mean")

nnet.tuned <- setHyperPars(nnet, par.vals = tuning$x)
nn_model <- mlr::train(nnet.tuned, task = task)

# FIX: the original called mlr::train() a second time on the identical
# tuned learner and used that duplicate fit (`model`) for prediction --
# redundant work, and since nnet starts from random weights the two fits
# could even differ. Reuse the model trained above instead. (The recorded
# confusion matrices below come from the duplicate fit and may shift
# marginally on a rerun.)
model <- nn_model

nn_pred_train <- predict(model, newdata = train_z, type = "class")
nn_pred_test <- predict(model, newdata = test_z, type = "class")

# Train-set confusion matrix and accuracy.
confMat <- table(train_z$Charts,nn_pred_train$data$response)
confMat # 0: not in charts, 1: in charts
##    
##      0  1
##   0 67  0
##   1  0 75
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 1
# 1

# Test-set confusion matrix and accuracy.
confMat <- table(test_z$Charts,nn_pred_test$data$response)
confMat # 0: not in charts, 1: in charts
##    
##      0  1
##   0 25  1
##   1  0 18
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9772727
# 1

# ``` (stray R Markdown code fence from the knitted document -- commented out so the file parses as R)